diff --git a/.gitignore b/.gitignore index c9f46de8fb8..99bb80d6380 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,8 @@ data.prototext* # Can also ignore all directories and files in a directory. # tmp/**/* build +spack_environments/users/ + + +# we don't want to collect slurm output +**/slurm-*.out diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000000..4afaeaac3b8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,11 @@ +[submodule "applications/graph/snap"] + path = applications/graph/snap + url = https://github.com/snap-stanford/snap + ignore = dirty +[submodule "applications/graph/largescale_node2vec"] + path = applications/graph/largescale_node2vec + url = https://lc.llnl.gov/bitbucket/scm/havoq/largescale_node2vec.git + ignore = dirty +[submodule "applications/ATOM/moses"] + path = applications/ATOM/moses + url = git@github.com:samadejacobs/moses.git diff --git a/.readthedocs.yml b/.readthedocs.yml index 9e2728c0935..dd95022b107 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,7 +1,24 @@ # .readthedocs.yml +# Config file for Read the Docs +# https://docs.readthedocs.io/en/stable/config-file/v2.html + +version: 2 + +sphinx: + builder: html + configuration: docs/conf.py + +formats: [] build: image: latest python: version: 3.7 + install: + - requirements: docs/sphinx_requirements.txt + +submodules: + include: [] + + diff --git a/CMakeLists.txt b/CMakeLists.txt index 4dfb77a0e19..807adfb27da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.13) project(LBANN CXX) @@ -48,7 +48,7 @@ endif () # set(LBANN_VERSION_MAJOR 0) -set(LBANN_VERSION_MINOR 99) +set(LBANN_VERSION_MINOR 100) set(LBANN_VERSION_PATCH 0) set(LBANN_VERSION "${LBANN_VERSION_MAJOR}.${LBANN_VERSION_MINOR}.${LBANN_VERSION_PATCH}") @@ -104,6 +104,20 @@ option(LBANN_WITH_CONDUIT "Enable Conduit library" ON) option(LBANN_WITH_CUDNN "Include Nvidia cuDNN" ON) +option(LBANN_WITH_DIHYDROGEN "Build with DiHydrogen support" OFF) +if (LBANN_WITH_DIHYDROGEN) + message(WARNING "DiHydrogen support is currently expermimental. " + "There is no stable interface. " + "Use caution before using any features.") +endif (LBANN_WITH_DIHYDROGEN) + +option(LBANN_WITH_DISTCONV "Enable DiHydrogen's Distconv" OFF) +if (LBANN_WITH_DISTCONV) + message(WARNING "Distconv support is currently expermimental. " + "There is no stable interface. " + "Use caution before using any features.") +endif (LBANN_WITH_DISTCONV) + option(LBANN_WITH_HWLOC "Enable topology-aware optimizations" ON) @@ -121,13 +135,10 @@ option(LBANN_WITH_VTUNE option(LBANN_WITH_UNIT_TESTING "Enable the unit testing framework (requires Catch2)" OFF) -# Enable parallel random matrix generation, if possible +# Use deterministic GPU algorithms and layer operations option(LBANN_DETERMINISTIC "Use deterministic algorithms as much as possible." OFF) -option(LBANN_SEQUENTIAL_INITIALIZATION - "Sequentially consistent initialization" OFF) - option(LBANN_DEBUG_PRINT_SUBTARGETS "Turn on debugging output of internal target properties." 
OFF) mark_as_advanced(LBANN_DEBUG_PRINT_SUBTARGETS) @@ -161,6 +172,11 @@ include(SetupCXX) ################################################################ # Required dependencies +find_package(Threads REQUIRED) + +# Argument parsing backend +find_package(Clara REQUIRED) + find_package(CEREAL NO_MODULE HINTS ${CEREAL_DIR} $ENV{CEREAL_DIR} PATH_SUFFIXES share/cmake/cereal @@ -172,16 +188,50 @@ set(LBANN_HAS_CEREAL ${CEREAL_FOUND}) # The imported target is just called "cereal". Super. # Setup the linear algebra library -find_package(Hydrogen 1.2.0 NO_MODULE QUIET +find_package(Hydrogen 1.3.3 NO_MODULE QUIET HINTS ${Hydrogen_DIR} ${HYDROGEN_DIR} $ENV{Hydrogen_DIR} $ENV{HYDROGEN_DIR} PATH_SUFFIXES lib/cmake/hydrogen NO_DEFAULT_PATH) if (NOT Hydrogen_FOUND) - find_package(Hydrogen 1.2.0 NO_MODULE QUIET REQUIRED) + find_package(Hydrogen 1.3.3 NO_MODULE QUIET REQUIRED) endif () message(STATUS "Found Hydrogen: ${Hydrogen_DIR}") set(LBANN_HAS_HYDROGEN ${Hydrogen_FOUND}) +# DiHydrogen and Distconv +if (LBANN_WITH_DISTCONV AND NOT LBANN_WITH_DIHYDROGEN) + message(FATAL_ERROR "Distconv requires DiHydrogen. Enable DiHydrogen to use Distconv.") +endif () + +if (LBANN_WITH_DIHYDROGEN) + if (LBANN_WITH_DISTCONV) + find_package(DiHydrogen CONFIG COMPONENTS Meta Patterns DistConv + HINTS ${DIHYDROGEN_DIR} $ENV{DIHYDROGEN_DIR} + ${H2_DIR} $ENV{H2_DIR} + PATH_SUFFIXES install/lib64/cmake install/lib/cmake + NO_DEFAULT_PATH) + find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns DistConv) + set(LBANN_HAS_DISTCONV TRUE) + else () + find_package(DiHydrogen CONFIG COMPONENTS Meta Patterns + HINTS ${DIHYDROGEN_DIR} $ENV{DIHYDROGEN_DIR} + ${H2_DIR} $ENV{H2_DIR} + PATH_SUFFIXES install/lib64/cmake install/lib/cmake + NO_DEFAULT_PATH) + find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns) + endif () + set(LBANN_HAS_DIHYDROGEN TRUE) +endif () + +# Inherit half-precision stuff from Hydrogen +set(LBANN_HAS_HALF ${HYDROGEN_HAVE_HALF}) # This is CPU-only + +# Not the ideal fix, but should be fine for now. +if (Aluminum_FOUND) + message(STATUS "Aluminum found in Hydrogen. Using Aluminum.") + set(LBANN_WITH_ALUMINUM ON CACHE BOOL "Use aluminum." FORCE) +endif () + include(SetupOpenMP) include(SetupMPI) include(SetupProtobuf) @@ -201,6 +251,11 @@ set(LBANN_HAS_OPENCV ${OpenCV_FOUND}) set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA}) set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA}) +# Only used if have GPU and have CPU half. 
+if (LBANN_HAS_CUDA AND LBANN_HAS_HALF) + set(LBANN_HAS_GPU_FP16 ${HYDROGEN_GPU_USE_FP16}) +endif () + if (LBANN_HAS_CUDA) enable_language(CUDA) @@ -214,13 +269,15 @@ endif () if (LBANN_WITH_ALUMINUM) # Aluminum may have already been found by Hydrogen if (NOT Aluminum_FOUND) - find_package(Aluminum 0.2.0 NO_MODULE QUIET + message(WARNING + "Using Aluminum without Hydrogen support may not be well-supported.") + find_package(Aluminum 0.3.0 NO_MODULE QUIET HINTS ${Aluminum_DIR} ${ALUMINUM_DIR} ${AL_DIR} $ENV{Aluminum_DIR} $ENV{ALUMINUM_DIR} $ENV{AL_DIR} PATH_SUFFIXES lib64/cmake/aluminum lib/cmake/aluminum NO_DEFAULT_PATH) if (NOT Aluminum_FOUND) - find_package(Aluminum 0.2.0 NO_MODULE QUIET) + find_package(Aluminum 0.3.0 NO_MODULE QUIET) endif () endif () set(LBANN_HAS_ALUMINUM ${Aluminum_FOUND}) @@ -264,6 +321,11 @@ if (LBANN_HAS_CUDA) include(SetupCUDAToolkit) + if (LBANN_HAS_GPU_FP16) + set_property(TARGET cuda::toolkit PROPERTY + INTERFACE_COMPILE_OPTIONS $<$:-arch=sm_60>) + endif (LBANN_HAS_GPU_FP16) + set(LBANN_HAS_CUDNN ${CUDNN_FOUND}) if (LBANN_HAS_ALUMINUM AND AL_HAS_NCCL) @@ -271,6 +333,16 @@ if (LBANN_HAS_CUDA) else () set(LBANN_HAS_NCCL2 FALSE) endif () + + if (LBANN_WITH_NVSHMEM) + find_package(NVSHMEM REQUIRED) + set_property(TARGET cuda::toolkit PROPERTY + INTERFACE_COMPILE_OPTIONS $<$:-arch=sm_70>) + # Build LBANN as a static library to get around a bug in NVSHMEM + set(BUILD_SHARED_LIBS OFF) + endif () + set(LBANN_HAS_NVSHMEM "${NVSHMEM_FOUND}") + endif (LBANN_HAS_CUDA) # This shouldn't be here, but is ok for now. This will occasionally be @@ -415,22 +487,28 @@ if (LBANN_WITH_CONDUIT) endif () endforeach () + get_filename_component(_conduit_include_dirs + "${CONDUIT_INCLUDE_DIRS}" DIRECTORY) + if (HDF5_FOUND_WITH_MODULE) list(APPEND _conduit_interface_link_libs ${HDF5_LIBRARIES}) - set_target_properties(conduit::conduit - PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${HDF5_INCLUDE_DIRS}") + list(APPEND _conduit_include_dirs + "${HDF5_INCLUDE_DIRS}") endif () + set_property(TARGET conduit::conduit + PROPERTY + INTERFACE_INCLUDE_DIRECTORIES + "${_conduit_include_dirs}") + set_target_properties(conduit::conduit PROPERTIES INTERFACE_LINK_LIBRARIES "${_conduit_interface_link_libs}") set(CONDUIT_LIBRARIES conduit::conduit) - set(LBANN_HAS_CONDUIT ${Conduit_FOUND}) endif (LBANN_WITH_CONDUIT) if (LBANN_WITH_UNIT_TESTING) @@ -446,7 +524,11 @@ if (LBANN_WITH_UNIT_TESTING) # Now that Catch2 has been found, start adding the unit tests include(CTest) include(Catch) + add_subdirectory(src/proto/unit_test) add_subdirectory(src/utils/unit_test) + add_subdirectory(src/weights/unit_test) + add_subdirectory(src/transforms/unit_test) + add_subdirectory(src/transforms/vision/unit_test) # Add this one last add_subdirectory(unit_test) @@ -459,16 +541,16 @@ add_subdirectory(docs) # Build LBANN ################################################################ +# Add LBANN source files +add_subdirectory(include) +add_subdirectory(src) + # Write the configure file configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_config.hpp.in" "${CMAKE_BINARY_DIR}/lbann_config.hpp" @ONLY) -# Add LBANN source files -add_subdirectory(include) -add_subdirectory(src) - # Create the LBANN library add_library(lbann ${LBANN_SOURCES} ${LBANN_HEADERS} ${LBANN_CUDA_SOURCES}) @@ -477,12 +559,10 @@ target_include_directories(lbann PUBLIC $ $) -if (LBANN_HAS_PYTHON) - target_include_directories(lbann PUBLIC ${Python_INCLUDE_DIRS}) -endif () - # Use the IMPORTED targets when possible. 
target_link_libraries(lbann PUBLIC LbannProto) +target_link_libraries(lbann PUBLIC Threads::Threads) +target_link_libraries(lbann PUBLIC clara::clara) target_link_libraries(lbann PUBLIC cereal) target_link_libraries(lbann PUBLIC OpenMP::OpenMP_CXX) target_link_libraries(lbann PUBLIC MPI::MPI_CXX) @@ -491,6 +571,15 @@ target_link_libraries(lbann PUBLIC ${HYDROGEN_LIBRARIES}) target_link_libraries(lbann PUBLIC ${OpenCV_LIBRARIES}) target_link_libraries(lbann PUBLIC ${CONDUIT_LIBRARIES}) +target_link_libraries(lbann PUBLIC + $ + $ + ) + +if (LBANN_WITH_DISTCONV) + target_link_libraries(lbann PUBLIC H2::H2DistConv) +endif () + if (LBANN_HAS_TBINF) target_link_libraries(lbann PUBLIC TBinf) endif () @@ -512,7 +601,12 @@ if (LBANN_HAS_VTUNE) endif () if (LBANN_HAS_PYTHON) - target_link_libraries(lbann PUBLIC ${Python_LIBRARIES}) + target_link_libraries(lbann PUBLIC Python::Python) +endif () + +if (LBANN_HAS_NVSHMEM) + set_property(TARGET lbann PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(lbann PUBLIC NVSHMEM::NVSHMEM) endif () if (TARGET LBANN_CXX_FLAGS_werror) @@ -521,6 +615,27 @@ endif () target_link_libraries(lbann PUBLIC ${DL_LIBRARY}) +# Fix the -g issue with Clang on OSX +if (APPLE) + # Remove -g from the options + string(REPLACE "-g" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-g" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") + + # Get all the sources and add "-g" to all of them. + get_target_property(_LBANN_SRCS lbann SOURCES) + set_source_files_properties(${_LBANN_SRCS} + PROPERTIES COMPILE_OPTIONS "-g") + + # Cleanup source files + foreach (bad_file IN LISTS _LBANN_SRCS) + get_source_file_property( + _SRC_COMPILE_OPTS "${bad_file}" COMPILE_OPTIONS) + string(REPLACE "-g" "" _SRC_COMPILE_OPTS "${COMPILE_OPTIONS}") + set_source_files_properties( + "${bad_file}" PROPERTIES COMPILE_OPTIONS "${_SRC_COMPILE_OPTS}") + endforeach () +endif () + # Clean things up include(LBANNDebugUtilities) lbann_remove_default_include_paths_from_all_subtargets(lbann) @@ -539,6 +654,8 @@ endif () add_subdirectory(model_zoo) add_subdirectory(model_zoo/tests) add_subdirectory(model_zoo/jag_utils) +add_subdirectory(applications/CANDLE/pilot2/tools) +add_subdirectory(applications/ATOM/utils) add_subdirectory(tests) add_subdirectory(scripts) @@ -733,6 +850,8 @@ string(APPEND _str "\n") #Print the true/false guys append_str_tf(_str LBANN_GNU_LINUX + LBANN_HAS_DIHYDROGEN + LBANN_HAS_DISTCONV LBANN_HAS_HYDROGEN LBANN_HAS_OPENCV LBANN_HAS_CEREAL @@ -747,7 +866,6 @@ append_str_tf(_str LBANN_HAS_DOXYGEN LBANN_HAS_LBANN_PROTO LBANN_HAS_ALUMINUM - LBANN_HAS_CONDUIT LBANN_HAS_PYTHON) string(APPEND _str "\n== End LBANN Configuration Summary ==\n") @@ -774,6 +892,13 @@ configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_module.lua.in" "${CMAKE_BINARY_DIR}/lbann_module.lua.install" @ONLY) +configure_file( + "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_module.tcl.in" + "${CMAKE_BINARY_DIR}/lbann_module.tcl.install") + install(FILES "${CMAKE_BINARY_DIR}/lbann_module.lua.install" RENAME "${LBANN_MODULEFILE_NAME}" DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modulefiles") +install(FILES "${CMAKE_BINARY_DIR}/lbann_module.tcl.install" + RENAME "${LBANN_VERSION}" + DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modulefiles/lbann") diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt index 13418207629..1ebd8e4a2b8 100644 --- a/ReleaseNotes.txt +++ b/ReleaseNotes.txt @@ -21,6 +21,102 @@ Bug fixes: Retired features: +============================== Release Notes: v0.100 
============================== +Support for new network structures: + - 3D molecular generation models for Metal Organic Frameworks from the CoRE MOF Database. + - 3D CosmoFlow Model + - DenseNet + - ATOM LSTM model + - RAS state classifier + - node2vec + - Transformer and other attention-based models + - ExaGAN (formerly CosmoGAN) + - MaCC ICF surrogate model + +Applications: + - Created a directory of example applications, deprecating the "model zoo" directory + +Support for new layers: + - Embedding layer + - Distributed embedding layer + - Channel-wise scale/bias layer + - Entry-wise scale/bias layer + - Gated-Recurrent Units (GRU) + - Entry-wise batchnorm + - Argmax, Argmin, and one-hot layers + - Layer norm + - Deconvolution layer (transposed convolution) + - Layers for channel-wise operations (channel-wise fully-connected, channel-wise softmax, channel-wise scale/bias, instance norm) + - Matrix multiply layer + +Python front-end: + - Can now configure contrib launcher with environment variables + - Added NERSC compute center + - Per-layer specification of compute device (CPU or GPU) + - Option to write custom batch scripts with Python front-end + +Performance optimizations: + - Parallelized Python data reader with "multiprocessing" module + - Fuse batchnorm stats allreduces in FP/BP. + - Tuned concatenate and slice layer + - Dynamically allocate and free memory for layer error signals (halves LBANN's memory footprint) + +Model portability & usability: + - Bamboo tests for individual layers + +Internal features: + - Added support for DistConv features (distributed, generalized, + parallel convolution) + - Added support for NVSHMEM 1.0 API (used in distributed embedding + layer and DistConv halo exchange) + - Support for multiple data types per model (per-layer) + - Support for per-layer mixed-precision weight training and inference, + includes per-weight object and objective function mixed-precision. + - Improved how and when the RNGs are initialized + - Callback to dump images to TensorBoard + - Callback to save model weights (useful to export to PyTorch) + - Callback to save top K models (LTFB) + - Improved run-to-run reproducibility by initializing weights in alphabetical order + - Moved models from model_zoo directory to applications directory + - Cleanup and refactoring of callbacks and layer instantiation + - Grouped batchnorm statistics + - Callback to print model description + - Refactored trainer and training-state out of the model class + - Support for transposing data in matrix multiply layers + - Added DiHydrogen tensor and DistConv library + - Added parallel strategy to layer class to support DistConv + - LBANN inference mode supports loading models from multiple directories + - Cleanup of checkpoint and restart logic + +I/O & data readers: + - Added in-memory data store that caches samples in CPU memory. 
It can be loaded + during the first epoch or preloaded + - Added new "transform" data preprocessing ingestion pipeline + - Added sample list format for specifying data sets + - Introduced data coordinator that manages data readers and extracts them from + the input layers + - Data store is able to checkpoint / spill its contents to local disk + - Data reader for SMILES strings + +Build system: + - Hydrogen 1.3.4 + - Aluminum 0.3.3 + - Improved documentation on Read the Docs (RTD) + - Robust support for using Spack as a build system around CMake + - Identified compute centers for specifying build and run dependencies + - Added Catch2-based tests + +Bug fixes: + - Fixed path resolution for dump weights, save model, and checkpoint callbacks + - Added mutexes for preloading the data store + - Fixed the LTFB exchange to include all ADAM optimizer state + - Fixed the mapping of I/O RNGs to I/O processing threads to ensure + consistent and correct multi-threaded performance + +Retired features: + - moving MNIST data reader is replaced by the Python data reader + - ASCII data reader is deprecated + ============================== Release Notes: v0.99 ============================== Support for new training algorithms: - Improvements to LTFB infrastructure (including transfer of SGD and Adam hyperparameters) diff --git a/applications/.gitignore b/applications/.gitignore new file mode 100644 index 00000000000..aa6a015fd1e --- /dev/null +++ b/applications/.gitignore @@ -0,0 +1,21 @@ +# Setup standard ignores to keep the applications directory hierarchy clean + +# Building in source tree garbage +.cproject +.project +*.o +*.a + +# Emacs backup garbage +.backup/ + +# Other standard ignores +*~ +*.pyc +\#*# +.#* +.*.swp +.DS_Store + +# Python garbage +__pycache__/ diff --git a/applications/ATOM/README.md b/applications/ATOM/README.md new file mode 100644 index 00000000000..1789b609f4a --- /dev/null +++ b/applications/ATOM/README.md @@ -0,0 +1,46 @@ +## Accelerating Therapeutics for Opportunities in Medicine (ATOM) + +Models for training neural networks to support the [ATOM](https://atomscience.org) project + +The train_atom_char_rnn.py script implements a GRU-based recurrent model for generating new SMILES strings. +Original neural network model and training hyperparameters are described in the [MOSES benchmark](https://github.com/samadejacobs/moses/tree/master/moses/char_rnn). Please see the LBANN documentation on how to install, build and run LBANN code.
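The model consumes SMILES strings as sequences of integer token IDs, looked up in a character vocabulary (one `token index` pair per line, as in `data/vocab_universal.txt`) and padded to a fixed length. The sketch below is only an illustration of that preprocessing, mirroring the padding logic in `dataset.py`; the function names are hypothetical, and the default `max_seq_len=57` / `pad_index=28` values are assumptions taken from `zinc_data_config.json` that must match whatever vocabulary is actually used.

```python
import numpy as np

def load_vocab(path):
    """Read a vocabulary file with one 'token index' pair per line."""
    vocab = {}
    with open(path) as f:
        for line in f:
            fields = line.rstrip("\n").split(" ")
            if len(fields) == 2 and fields[1].isdigit():
                vocab[fields[0]] = int(fields[1])
    return vocab

def encode_smiles(smiles, vocab, max_seq_len=57, pad_index=28):
    """Map characters to token IDs and pad/truncate to max_seq_len, as dataset.py does."""
    ids = [vocab[c] for c in smiles if c in vocab]
    if len(ids) < max_seq_len:
        ids = ids + [pad_index] * (max_seq_len - len(ids))
    return np.asarray(ids[:max_seq_len], dtype=np.int64)

# Example: encode_smiles("CC(=O)Oc1ccccc1C(=O)O", load_vocab("data/vocab_universal.txt"))
```

In the actual training pipeline these IDs come from the Python or SMILES data readers; the snippet is only meant to make the expected input format concrete.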
+ +### How to train +```bash +run python3 train_atom_char_rnn.py +``` + +Expected training output in LBANN (250K ZINC training dataset, on a single LLNL Pascal GPU) is shown below: +``` +-------------------------------------------------------------------------------- +[0] Epoch : stats formated [tr/v/te] iter/epoch = [3907/0/0] + global MB = [ 64/ 0/ 0] global last MB = [ 16 / 0 / 0 ] + local MB = [ 64/ 0/ 0] local last MB = [ 16+0/ 0+0/ 0+0] +-------------------------------------------------------------------------------- +model0 (instance 0) training epoch 0 objective function : 0.438031 +model0 (instance 0) training epoch 0 run time : 1009.55s +model0 (instance 0) training epoch 0 mini-batch time statistics : 0.257328s mean, 1.89938s max, 0.15177s min, 0.0331048s stdev +-------------------------------------------------------------------------------- +[1] Epoch : stats formated [tr/v/te] iter/epoch = [3907/0/0] + global MB = [ 64/ 0/ 0] global last MB = [ 16 / 0 / 0 ] + local MB = [ 64/ 0/ 0] local last MB = [ 16+0/ 0+0/ 0+0] +-------------------------------------------------------------------------------- +model0 (instance 0) training epoch 1 objective function : 0.37321 +model0 (instance 0) training epoch 1 run time : 1006.6s +model0 (instance 0) training epoch 1 mini-batch time statistics : 0.256573s mean, 0.912742s max, 0.158709s min, 0.0193512s stdev +``` + +### Inference and Sampling + +1. Clone this version of the [MOSES benchmark repository](https://github.com/samadejacobs/moses) and follow its installation instructions +2. Run inference using LBANN pre-trained model parameters + +```bash + + python3 MOSES_DIR/scripts/run.py --model char_rnn --n_samples NUM_SAMPLES \ + --lbann_weights_dir LBANN_WEIGHTS_DIR \ + --lbann_epoch_counts EPOCHS + +``` + +The command above will load pre-trained LBANN weights and biases from LBANN_WEIGHTS_DIR at the specified EPOCH counts, generate up to NUM_SAMPLES new molecules, and calculate metrics on the new molecules, some of which are computed relative to the test (validation) dataset. diff --git a/applications/ATOM/data/vocab_universal.txt b/applications/ATOM/data/vocab_universal.txt new file mode 100644 index 00000000000..6bca1c7fff5 --- /dev/null +++ b/applications/ATOM/data/vocab_universal.txt @@ -0,0 +1,98 @@ +a 0 +b 1 +c 2 +d 3 +e 4 +f 5 +g 6 +h 7 +i 8 +j 9 +k 10 +l 11 +m 12 +n 13 +o 14 +p 15 +q 16 +r 17 +s 18 +t 19 +u 20 +v 21 +w 22 +x 23 +y 24 +z 25 +A 26 +B 27 +C 28 +D 29 +E 30 +F 31 +G 32 +H 33 +I 34 +J 35 +K 36 +L 37 +M 38 +N 39 +O 40 +P 41 +Q 42 +R 43 +S 44 +T 45 +U 46 +V 47 +W 48 +X 49 +Y 50 +Z 51 +0 52 +1 53 +2 54 +3 55 +4 56 +5 57 +6 58 +7 59 +8 60 +9 61 +! 62 +" 63 +# 64 +$ 65 +% 66 +& 67 +' 68 +( 69 +) 70 +* 71 ++ 72 +, 73 +- 74 +. 75 +/ 76 +: 77 +; 78 +< 79 += 80 +> 81 +? 
82 +@ 83 +[ 84 +\ 85 +] 86 +^ 87 +_ 88 +` 89 +{ 90 +| 91 +} 92 +~ 93 + 94 + 95 + 96 + 97 diff --git a/applications/ATOM/dataset.py b/applications/ATOM/dataset.py new file mode 100644 index 00000000000..35bf0faa699 --- /dev/null +++ b/applications/ATOM/dataset.py @@ -0,0 +1,29 @@ +import os +import numpy as np +import json + + +# the idea here is to use the same code with abritrary sets of data +with open(os.environ['DATA_CONFIG'], 'rb') as handle: + config = json.load(handle) + +pad_index = config['pad_index'] +max_seq_len = config['max_seq_len'] + +samples = np.load(config['data_path'], allow_pickle=True) + +# Sample access functions +def get_sample(index): + sample = samples[index] + if len(sample) < max_seq_len: + sample = np.concatenate((sample, np.full(max_seq_len-len(sample), pad_index))) + else: + sample = np.resize(sample, max_seq_len) + return sample + +def num_samples(): + return samples.shape[0] + +def sample_dims(): + return [max_seq_len] + diff --git a/applications/ATOM/moses b/applications/ATOM/moses new file mode 160000 index 00000000000..28932ce6ff6 --- /dev/null +++ b/applications/ATOM/moses @@ -0,0 +1 @@ +Subproject commit 28932ce6ff6fb1883be888a48b431e17835be1c8 diff --git a/applications/ATOM/readme_smiles_data_reader.txt b/applications/ATOM/readme_smiles_data_reader.txt new file mode 100644 index 00000000000..fa669ca9e15 --- /dev/null +++ b/applications/ATOM/readme_smiles_data_reader.txt @@ -0,0 +1,46 @@ +# Example execution line for running with the smiles_data_reader + +setenv BASE /usr/workspace/wsb/hysom/corona/applications/ATOM + +run python3 train_atom_char_rnn_REV.py \ + --nodes=16 \ + --batch-size=1024 \ + --sequence-length=57 \ + --embedding-dim=30 \ + --num-embeddings=30 \ + --pad-index=28 \ + --vocab=/p/lustre2/brainusr/datasets/zinc/vocab_train.txt \ + --data-reader-prototext=$BASE/smiles_data_reader.prototext \ + |& tee out + +WARNING: at present, code assumes the input file is in csv format with + tab delimiters + +Optional arguments: + + --num-samples= # If not given, uses all samples in the file + +Notes: + --sequence-length, --vocab, --num-embeddings, and --embedding-dim should + match the data set; vocabs for various datasets are in /p/lustre2/brainusr/datasets/zinc, + /p/lustre2/brainusr/datasets/enamine, etc. + For now, assume num-embeddings = embedding-dim = vocab.size(), and + pad-index= vocab.size()-2 + + If --sequence-length is too short, portions of some samples will be discarded. + + The smiles_data_reader dtor prints any characters that were not + found in the vocabulary, and the number of characters (if any) that + were discarded (but note, statistics are only gathered for P_0) + +WARNING (when running with the Python data_reader): + ensure that "--sequence-length" matches "max_seq_len" in the + json file; this is not error-checked (as of this writing). + Also ensure "--pad-index" matches the entry in the json file; + also not error checked. 
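Because neither value is error-checked, a small pre-launch check along the following lines can catch mismatches before a long run is submitted. This is only a sketch: the helper name is hypothetical, and the key names (max_seq_len, pad_index) are taken from zinc_data_config.json in this directory.

```python
import json
import sys

def check_data_config(json_path, sequence_length, pad_index):
    """Fail early if CLI settings disagree with the data-config JSON."""
    with open(json_path) as f:
        cfg = json.load(f)
    mismatches = []
    if cfg.get("max_seq_len") != sequence_length:
        mismatches.append(f"--sequence-length={sequence_length} but max_seq_len={cfg.get('max_seq_len')}")
    if cfg.get("pad_index") != pad_index:
        mismatches.append(f"--pad-index={pad_index} but pad_index={cfg.get('pad_index')}")
    if mismatches:
        sys.exit("data config mismatch: " + "; ".join(mismatches))

# e.g. check_data_config("zinc_data_config.json", sequence_length=57, pad_index=28)
```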
+ +Verification: to test the above cmd line against output using the python data reader: +run python3 ./train_atom_char_rnn.py --nodes=16 --pad-index=28 --sequence-length=57 --embedding-dim=30 --num-embeddings=30 --batch-size=1024 |& tee out + + + diff --git a/applications/ATOM/requirements.txt b/applications/ATOM/requirements.txt new file mode 100644 index 00000000000..8c5fcd94d99 --- /dev/null +++ b/applications/ATOM/requirements.txt @@ -0,0 +1,4 @@ +numpy +protobuf +six +torch diff --git a/applications/ATOM/smiles_data_reader.prototext b/applications/ATOM/smiles_data_reader.prototext new file mode 100644 index 00000000000..d56e9e7e91c --- /dev/null +++ b/applications/ATOM/smiles_data_reader.prototext @@ -0,0 +1,10 @@ +data_reader { + reader { + name: "smiles" + role: "train" + shuffle: true + percent_of_data_to_use: 1.0 + data_filedir: "/p/lustre2/brainusr/datasets/zinc" + data_filename: "train.csv" + } +} diff --git a/applications/ATOM/train_atom_char_rnn.py b/applications/ATOM/train_atom_char_rnn.py new file mode 100644 index 00000000000..dc808ac1ff4 --- /dev/null +++ b/applications/ATOM/train_atom_char_rnn.py @@ -0,0 +1,305 @@ +import argparse +import datetime +import os +import os.path +import sys + +from google.protobuf import text_format as txtf +import json +import numpy as np +import torch + +import lbann +import lbann.contrib.launcher +import lbann.modules +from lbann.util import str_list + + +def construct_lc_launcher_args(): + + # defaults correspond to the settings needed for training on the moses dataset + parser = argparse.ArgumentParser(prog="lbann charVAE training") + parser.add_argument("--partition", default=None) + parser.add_argument("--account", default="hpcdl") + parser.add_argument("--scheduler", default="slurm") + parser.add_argument( + "--data-module-file", + default="dataset.py", + help="specifies the module that contains the logic for loading data", + ) + parser.add_argument( + "--data-config", + default=os.path.join( + os.path.abspath(os.path.dirname(__file__)), "zinc_data_config.json" + ), + help="path to a data config file that is used for the construction of python data reader", + ) + parser.add_argument( + "--time-limit", + type=int, + default=720, + help="specified time limit in number of minutes", + ) + parser.add_argument("--nodes", type=int, default=1) + parser.add_argument("--job-name", default="atom_char_rnn") + parser.add_argument("--embedding-dim", type=int, default=None) + parser.add_argument("--num-embeddings", type=int, default=None) + parser.add_argument("--batch-size", type=int, default=64) + parser.add_argument("--num-epochs", type=int, default=10) + parser.add_argument("--data-reader-prototext", default=None) + parser.add_argument("--pad-index", type=int, default=None) + parser.add_argument("--sequence-length", type=int, default=None) + parser.add_argument("--dump_weights_dir", type=str, default="weights") + parser.add_argument("--num-samples", type=int, default=None) + parser.add_argument("--num-io-threads", type=int, default=11) + parser.add_argument("--vocab", default=None) + parser.add_argument("--delimiter", default="c") + parser.add_argument("--no-header", type=bool, default=True) + + # these are specific to the Trainer object + parser.add_argument( + "--procs-per-trainer", + type=int, + default=0, + help="number of processes to use per trainer", + ) + + # these are the bits and pieces required for loading the model in the moses library...may be useful for evaluation tasks/continuing training/etc + parser.add_argument("--gamma", 
type=float, default=0.5, help="") + parser.add_argument( + "--hidden", type=int, default=768, help="size of the hidden layer" + ) + parser.add_argument( + "--lr", + type=float, + default=1e-3, + help="optimizer learning rate to use for training", + ) + parser.add_argument( + "--num-layers", type=int, default=1, help="number of LSTM layers" + ) + parser.add_argument( + "--step-size", type=int, default=10, help="learning rate decay step size" + ) + + # this is just for compatiblity with the moses code + parser.add_argument("--dropout", type=float, default=0.5, help="") + return parser.parse_args() + + +# ============================================== +# Setup and launch experiment +# ============================================== + + +def construct_model(run_args): + """Construct LBANN model. + + Initial model for ATOM molecular SMILES generation + Network architecture and training hyperparameters from + https://github.com/samadejacobs/moses/tree/master/moses/char_rnn + + """ + + pad_index = run_args.pad_index + assert pad_index is not None + + sequence_length = run_args.sequence_length + assert sequence_length is not None + + print("sequence length is {}".format(sequence_length)) + data_layout = "data_parallel" + + # Layer graph + _input = lbann.Input(name="inp_tensor", target_mode="N/A") + print(sequence_length) + x_slice = lbann.Slice( + _input, + axis=0, + slice_points=str_list(range(sequence_length + 1)), + name="inp_slice", + ) + + # embedding layer + emb = [] + embedding_dim = run_args.embedding_dim + num_embeddings = run_args.num_embeddings + assert embedding_dim is not None + assert num_embeddings is not None + + emb_weights = lbann.Weights( + initializer=lbann.NormalInitializer(mean=0, standard_deviation=1), + name="emb_matrix", + ) + + lstm1 = lbann.modules.GRU(size=run_args.hidden, data_layout=data_layout) + fc = lbann.modules.FullyConnectedModule( + size=num_embeddings, data_layout=data_layout + ) + + last_output = lbann.Constant( + value=0.0, + num_neurons="{}".format(run_args.hidden), + data_layout=data_layout, + name="lstm_init_output", + ) + + lstm1_prev_state = [last_output] + + loss = [] + idl = [] + for i in range(sequence_length): + idl.append(lbann.Identity(x_slice, name="slice_idl_" + str(i), device="CPU")) + + for i in range(sequence_length - 1): + + emb_l = lbann.Embedding( + idl[i], + name="emb_" + str(i), + weights=emb_weights, + embedding_dim=embedding_dim, + num_embeddings=num_embeddings, + ) + + x, lstm1_prev_state = lstm1(emb_l, lstm1_prev_state) + fc_l = fc(x) + y_soft = lbann.Softmax(fc_l, name="soft_" + str(i)) + gt = lbann.OneHot(idl[i + 1], size=num_embeddings) + ce = lbann.CrossEntropy([y_soft, gt], name="loss_" + str(i)) + # mask padding in input + pad_mask = lbann.NotEqual( + [idl[i], lbann.Constant(value=pad_index, num_neurons="1")], + ) + ce_mask = lbann.Multiply([pad_mask, ce], name="loss_mask_" + str(i)) + loss.append(lbann.LayerTerm(ce_mask, scale=1 / (sequence_length - 1))) + + layers = list(lbann.traverse_layer_graph(_input)) + # Setup objective function + weights = set() + for l in layers: + weights.update(l.weights) + obj = lbann.ObjectiveFunction(loss) + + callbacks = [ + lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackStepLearningRate(step=run_args.step_size, amt=run_args.gamma), + lbann.CallbackDumpWeights(directory=run_args.dump_weights_dir, epoch_interval=1), + ] + + # Construct model + return lbann.Model( + run_args.num_epochs, + layers=layers, + weights=weights, + objective_function=obj, + callbacks=callbacks + ) + 
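# Note: the objective assembled above is a masked, length-normalized cross entropy.
# At each of the (sequence_length - 1) prediction steps, NotEqual(token, pad_index)
# produces a 0/1 mask that zeroes the cross-entropy term for padded positions, and
# the LayerTerm scale of 1/(sequence_length - 1) averages the remaining terms over
# time steps. The GRU and FullyConnectedModule objects are created once and called
# at every step, so their weights are shared across the unrolled sequence.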
#callbacks=callbacks + + +def construct_data_reader(run_args): + """ + Construct Protobuf message for Python data reader. + + The Python data reader will import this Python file to access the + sample access functions. + + """ + + module_file = os.path.abspath(run_args.data_module_file) + os.environ["DATA_CONFIG"] = os.path.abspath(run_args.data_config) + + module_name = os.path.splitext(os.path.basename(module_file))[0] + module_dir = os.path.dirname(module_file) + + print("module_name: {}\tmodule_dir: {}".format(module_name, module_dir)) + + # Base data reader message + message = lbann.reader_pb2.DataReader() + + # Training set data reader + data_reader = message.reader.add() + data_reader.name = "python" + data_reader.role = "train" + data_reader.shuffle = True + data_reader.percent_of_data_to_use = 1.0 + data_reader.python.module = module_name + data_reader.python.module_dir = module_dir + data_reader.python.sample_function = "get_sample" + data_reader.python.num_samples_function = "num_samples" + data_reader.python.sample_dims_function = "sample_dims" + + return message + + +def main(): + run_args = construct_lc_launcher_args() + + # add data_config data + # and do not overwrite args if data_reader_prototext is enabled + if os.path.isfile(run_args.data_config) and not run_args.data_reader_prototext: + with open(run_args.data_config, "r") as f: + config = json.load(f) + for k, v in config.items(): + setattr(run_args, k, v) + + trainer = lbann.Trainer( + run_args.batch_size, + name=None, + procs_per_trainer=run_args.procs_per_trainer, + ) + + # define data_reader + if run_args.data_reader_prototext: + print("Using data_reader_prototext") + assert run_args.sequence_length is not None + assert run_args.vocab is not None + + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(run_args.data_reader_prototext, "r") as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader = data_reader_proto.data_reader + else: + data_reader = construct_data_reader(run_args) + + if "LBANN_EXPERIMENT_DIR" in os.environ: + work_dir = os.environ["LBANN_EXPERIMENT_DIR"] + else: + work_dir = os.path.join(os.getcwd()) + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + experiment_dir = os.path.join( + work_dir, "{}_{}".format(timestamp, run_args.job_name) + ) + if not os.path.exists(experiment_dir): + os.makedirs(experiment_dir) + + # model and optimizer + model = construct_model(run_args) + opt = lbann.Adam(learn_rate=run_args.lr, beta1=0.9, beta2=0.99, eps=1e-8) + + # dump the config to the experiment_dir so that it can be used to load the model in pytorch (moses codebase) + ppn = 4 if run_args.scheduler == "lsf" else 2 + print("args:\n" + str(run_args)) + torch.save(run_args, "{}/{}_config.pt".format(experiment_dir, run_args.job_name)) + status = lbann.contrib.launcher.run( + trainer, + model, + data_reader, + opt, + partition=run_args.partition, + scheduler=run_args.scheduler, + account=run_args.account, + time_limit=run_args.time_limit, + nodes=run_args.nodes, + procs_per_node=ppn, + job_name=run_args.job_name, + experiment_dir=experiment_dir, + lbann_args=f"--vocab={run_args.vocab} --num_samples={run_args.num_samples} --sequence_length={run_args.sequence_length} --num_io_threads={run_args.num_io_threads} --no_header={run_args.no_header} --delimiter={run_args.delimiter}", + ) + + print("LBANN launcher status:\n" + str(status)) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/applications/ATOM/utils/CMakeLists.txt b/applications/ATOM/utils/CMakeLists.txt new file mode 
100644 index 00000000000..7d76fb6d6f5 --- /dev/null +++ b/applications/ATOM/utils/CMakeLists.txt @@ -0,0 +1,25 @@ +# Add a target to control building all the utilities +add_custom_target(atom-utils) + +add_executable(compute_vocab + EXCLUDE_FROM_ALL compute_vocab.cpp) + target_link_libraries(compute_vocab lbann) + add_dependencies(atom-utils compute_vocab) + +# Install the binaries +install( + TARGETS compute_vocab + OPTIONAL + EXPORT LBANNTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + +#(from Tom) +# The use of `OPTIONAL` here will trigger CMake warnings. These can +# safely be ignored and tests confirm that. See these for more info: +# +# https://gitlab.kitware.com/cmake/cmake/issues/18258 +# https://cmake.org/pipermail/cmake/2011-August/046014.html + diff --git a/applications/ATOM/utils/build_universal_vocab.py b/applications/ATOM/utils/build_universal_vocab.py new file mode 100644 index 00000000000..344dda974e4 --- /dev/null +++ b/applications/ATOM/utils/build_universal_vocab.py @@ -0,0 +1,26 @@ +# +# run with python 2.7 +# +import string + +a1 = string.letters +a2 = string.digits +a3 = string.punctuation +a4 = a1 + a2 + a3 + +out = open('vocab_universal.txt', 'w') +id = 0 +for c in a4 : + out.write(c + ' ' + str(id) + '\n') + id += 1 +out.write(' ' + str(id) + '\n') +id += 1 +out.write(' ' + str(id) + '\n') +id += 1 +out.write(' ' + str(id) + '\n') +id += 1 +out.write(' ' + str(id) + '\n') +id += 1 + +out.close() +print('\nwrote file: vocab_universal.txt\n') diff --git a/applications/ATOM/utils/compute_profile.py b/applications/ATOM/utils/compute_profile.py new file mode 100644 index 00000000000..2d54c0d597d --- /dev/null +++ b/applications/ATOM/utils/compute_profile.py @@ -0,0 +1,50 @@ +import sys + +if len(sys.argv) != 3 : + print('usage:') + print(' ' + sys.argv[0] + ' input_fn output_fn') + print('function:') + print(' writes data for plotting num_sequences as a function') + print(' of sequence length to "output_fn"; prints length') + print(' of longest sequence to cout (add two for , )') + print('delimiter:') + print(' is hard-coded for comma\n') + exit(9) + +a = open(sys.argv[1]) +a.readline() #discard header +out = open(sys.argv[2], 'w') + +longest = 0 +longest_seq = '' +longest_line_num = 0 + +data = {} +j = 0 +for line in a : + j += 1 + if j % 1000 == 0 : print(str(j/1000) + 'K lines processed') + t = line.split(',') + x = len(t[0]) + if x not in data : + data[x] = 0 + data[x] += 1 + if x > longest : + longest = x + longest_seq = t[0] + longest_line_num = j-1 + +v = [] +for ell in data : + v.append( (ell, data[ell]) ) +v.sort() + + +for d in v : + out.write(str(d[0]) + ' ' + str(d[1]) + '\n') +print('\noutput written to: ', sys.argv[2] + '\n') +out.close() + +print('\nlongest sequence length: ' + str(longest)) +print('line number of longest: ' + str(longest_line_num)) +print('longest sequence length: ' + longest_seq) diff --git a/applications/ATOM/utils/compute_sample_lengths.py b/applications/ATOM/utils/compute_sample_lengths.py new file mode 100644 index 00000000000..838779d50a9 --- /dev/null +++ b/applications/ATOM/utils/compute_sample_lengths.py @@ -0,0 +1,43 @@ +import sys + +if len(sys.argv) != 3 : + print(F''' + usage: {sys.argv[0]} input_fn output_fn + function: computes the length of each SMILES string + output: each line of output contains a file name, followed by + the length of each string + where: + "input_fn" contains the names of one or more smiles files; + 
Assumes each file contains a single header line; + Assumes delimiter is either tab or comma + ''' + ) + exit(9) + +a = open(sys.argv[1]) +out = open(sys.argv[2], 'w') + +sample_id = -1 +num_files = -1 +for line in a : + out.write(line[:-1]) + print('opening: ' + line[:-1]) + b = open(line[:-1]) + num_files += 1 + b.readline() #discard header + for line in b : + sample_id += 1 + j = line.find(',') + if j == -1 : + j == line.find('\t') + if j == -1 : + print(f"failed to find delimiting character (comma or tab) on line # {sample_id} of file: {line[:-1]}") + exit(9) + out.write(' ' + str(len( line[:j] ))) + out.write('\n') + b.close() + if num_files == 3 : break + +a.close() +out.close() +print(F'\noutput has been written to: {sys.argv[2]}\n') diff --git a/applications/ATOM/utils/compute_vocab.cpp b/applications/ATOM/utils/compute_vocab.cpp new file mode 100644 index 00000000000..4c3675dc4ad --- /dev/null +++ b/applications/ATOM/utils/compute_vocab.cpp @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/timer.hpp" +#include "lbann/utils/commify.hpp" + +using namespace lbann; + +int main(int argc, char **argv) { + lbann::world_comm_ptr comm = lbann::initialize(argc, argv); + int np = comm->get_procs_in_world(); + + std::cerr << "STARTED!\n"; + + try { + + if (np != 1) { + LBANN_ERROR("please run with a single processor"); + } + if (argc < 3) { + std::cerr + << "usage: " << argv[0] + << " --input_fn= --output_fn= --delimiter=\n" + << "where: input_fn is csv file containing SMILES strings;\n" + << " --delimiter is c (comma), t (tab) or 0 (none)\n" + << "function: computes vocabulary\n"; + exit(9); + } + + options *opts = options::get(); + opts->init(argc, argv); + double tm1 = get_time(); + + const std::string input_fn = opts->get_string("input_fn"); + std::ifstream in(input_fn.c_str()); + if (!in) { + LBANN_ERROR("failed to open ", input_fn , " for reading"); + } + + const std::string output_fn = opts->get_string("output_fn"); + std::ofstream out(output_fn.c_str(), std::ios::binary); + if (!out) { + LBANN_ERROR("failed to open ", output_fn, " for writing"); + } + + const std::string w = opts->get_string("delimiter"); + const char ww = w[0]; + char d = 0; + switch (ww) { + case 'c' : + d = ','; + break; + case 't' : + d = '\t'; + break; + case '0' : + d = '\0'; + break; + default : + LBANN_ERROR("Invalid delimiter character; should be 'c', 't', '0'; you passed: ", ww); + } + + std::set s; + + std::string line; + getline(in, line); //discard header + size_t j = 1; + while (!in.eof()) { + ++j; + if (j % 1000 == 0) std::cout << j/1000 << "K lines processed" << std::endl; + getline(in, line); + if (line.size() < 5) continue; + size_t h = line.find(d); + if (h == std::string::npos) { + LBANN_ERROR("failed to find delimiter: ", d, " on line ", j); + } + const std::string smiles = line.substr(0, h); + for (const auto &t : smiles) { + s.insert(t); + } + } + + int idx = 0; + for (const auto &t : s) { + out << t << " " << idx++ << std::endl; + } + out << " " << idx++ << std::endl; + out << " " << idx++ << std::endl; + out << " " << idx++ << std::endl; + out << " " << idx++ << std::endl; + + in.close(); + out.close(); + + std::cout << "\nprocessing time: " << get_time() - tm1 << std::endl; + + } catch (lbann::exception& e) { + El::ReportException(e); + return EXIT_FAILURE; + } catch (std::exception& e) { + El::ReportException(e); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git 
a/applications/ATOM/utils/compute_vocab_frequencies.py b/applications/ATOM/utils/compute_vocab_frequencies.py new file mode 100644 index 00000000000..a0d04590bd9 --- /dev/null +++ b/applications/ATOM/utils/compute_vocab_frequencies.py @@ -0,0 +1,45 @@ +import sys + +if len(sys.argv) != 3 : + print(F''' + usage: {sys.argv[0]} input_filename output_filename + where: + "input_filename" is a SMILES csv filename + function: + computes the frequency of each character in the vocabulary, + and prints same to "output_filename" + ''' + ) + exit(9) + +out = open(sys.argv[2], 'w') + +a = open(sys.argv[1]) +a.readline() # discard header +h = {} +j = 1 +for line in a : + k1 = line.find(',') + if k1 == -1 : + k1 = line.find('\t') + if k1 == -1 : + print('failed to find comma or tab delimiter on line # ' + str(j)) + exit(9) + s = line[:k1] + for c in s : + if c not in h : + h[c] = 0 + h[c] += 1 + j += 1 + if j % 1000 == 0 : print(str(j/1000) + 'K samples processed') + +v = [] +for c in h.keys() : + v.append( (h[c], c) ) +v.sort() + +for x in v : + print(x) + out.write(str(x[0]) + ' ' + str(x[1]) + '\n') +out.close() +print('\n\nOutput has also been written to: ' + sys.argv[2] + '\n') diff --git a/applications/ATOM/zinc_data_config.json b/applications/ATOM/zinc_data_config.json new file mode 100644 index 00000000000..a583ce84eea --- /dev/null +++ b/applications/ATOM/zinc_data_config.json @@ -0,0 +1,10 @@ +{ + + "pad_index": 28, + "sequence_length": 57, + "max_seq_len": 57, + "data_path": "/p/lustre2/brainusr/datasets/zinc/moses_zinc_train250K.npy", + "embedding_dim": 30, + "num_embeddings": 30 + +} diff --git a/applications/CANDLE/pilot2/data/Min_Max_Mean_Std-Dev.txt b/applications/CANDLE/pilot2/data/Min_Max_Mean_Std-Dev.txt new file mode 100644 index 00000000000..951825fe7a7 --- /dev/null +++ b/applications/CANDLE/pilot2/data/Min_Max_Mean_Std-Dev.txt @@ -0,0 +1,5 @@ +max x/y/z: 9.6067 7.46754 3.14485 +min x/y/z: -3.53206 -5.68766 -7.9664 +mean x/y/z: 1.66227 -0.00509318 -2.23788 +std dev: 1.17969 0.773128 1.04863 + diff --git a/applications/CANDLE/pilot2/data/pilot2_normalization.txt b/applications/CANDLE/pilot2/data/pilot2_normalization.txt new file mode 100644 index 00000000000..635bb276d2b --- /dev/null +++ b/applications/CANDLE/pilot2/data/pilot2_normalization.txt @@ -0,0 +1,15 @@ +max min mean std_dev: +2.40001 0 0.36777 0.248641 +1.98564 0 0.186917 0.158741 +1.95085 0 0.138673 0.127767 +2.53137 3.24127e-08 0.447338 0.240423 +2.48228 0 0.273659 0.220102 +2.3157 0 0.408046 0.198704 +2.06274 0 0.0375413 0.0741625 +2.50623 3.25748e-05 0.663633 0.218187 +2.6136 6.98301e-06 0.712461 0.265916 +2.41215 0 0.2882 0.213314 +2.80934 0 0.0411697 0.0691358 +2.4133 0 0.151854 0.150553 +2.69434 0 0.724106 0.293084 +2.71227 0.00951633 0.893146 0.222718 diff --git a/applications/CANDLE/pilot2/tools/CMakeLists.txt b/applications/CANDLE/pilot2/tools/CMakeLists.txt new file mode 100644 index 00000000000..e95927d0252 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/CMakeLists.txt @@ -0,0 +1,50 @@ +# Add a target to control building all the utilities +add_custom_target(pilot2-utils) + +add_executable(test_ras_lipid_data_files_for_errors + EXCLUDE_FROM_ALL test_ras_lipid_data_files_for_errors.cpp) + target_link_libraries(test_ras_lipid_data_files_for_errors lbann) + add_dependencies(pilot2-utils test_ras_lipid_data_files_for_errors) + +add_executable(compute_ras_lipid_sig1_normalization + EXCLUDE_FROM_ALL compute_ras_lipid_sig1_normalization.cpp) + target_link_libraries(compute_ras_lipid_sig1_normalization lbann) + 
add_dependencies(pilot2-utils compute_ras_lipid_sig1_normalization) + +add_executable(compute_ras_lipid_bbs_euclid_distances + EXCLUDE_FROM_ALL compute_ras_lipid_bbs_euclid_distances.cpp) + target_link_libraries(compute_ras_lipid_bbs_euclid_distances lbann) + add_dependencies(pilot2-utils compute_ras_lipid_bbs_euclid_distances) + +add_executable(compute_ras_lipid_bbs_max_min + EXCLUDE_FROM_ALL compute_ras_lipid_bbs_max_min.cpp) + target_link_libraries(compute_ras_lipid_bbs_max_min lbann) + add_dependencies(pilot2-utils compute_ras_lipid_bbs_max_min) + +add_executable(compute_ras_lipid_bbs_euclid_normalization + EXCLUDE_FROM_ALL compute_ras_lipid_bbs_euclid_normalization.cpp) + target_link_libraries(compute_ras_lipid_bbs_euclid_normalization lbann) + add_dependencies(pilot2-utils compute_ras_lipid_bbs_euclid_normalization) + +# Install the binaries +install( TARGETS + test_ras_lipid_data_files_for_errors + compute_ras_lipid_sig1_normalization + compute_ras_lipid_bbs_euclid_distances + compute_ras_lipid_bbs_euclid_normalization + + OPTIONAL + EXPORT LBANNTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + ) + +#(from Tom) +# The use of `OPTIONAL` here will trigger CMake warnings. These can +# safely be ignored and tests confirm that. See these for more info: +# +# https://gitlab.kitware.com/cmake/cmake/issues/18258 +# https://cmake.org/pipermail/cmake/2011-August/046014.html + diff --git a/applications/CANDLE/pilot2/tools/Notes.txt b/applications/CANDLE/pilot2/tools/Notes.txt new file mode 100644 index 00000000000..c3a27db71d3 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/Notes.txt @@ -0,0 +1,39 @@ +These notes are for use during development +========================================================================= +(mail 1/21) +Good question. It's a good idea to scale these values, especially since we are mixing them into the input with lipid densities. Can you look at doing the same three options here? + +a) raw values w/o scaling +b) min-max +c) mean/stddev (z-scale) + +(mail from 1/18/2020) +Hi Dave, +We currently have the models reading in the lipid densities (13x13x14) cube. We next want to also include information about the RAS BB positions. Each sample (frame) has a string of 184 RAS protein backbone beads, and we have the (x,y,z) coordinates of each one. You can see this as the 184x3 portion below: + +In [1]: import numpy as np + +In [2]: dat = np.load('/p/gpfs1/moody20/pilot2/lipid_density/sr4/pfpatch_000000917813_sig1.npz') + +In [3]: dat.keys() +Out[3]: ['rots', 'states', 'tilts', 'density_sig1', 'frames', 'bbs', 'probs'] + +In [4]: dat['bbs'].shape +Out[4]: (586, 184, 3) + +That particular patch file has 586 frames, and each frame has the xyz-coords for 184 backbone beads. + +There are lots of ways we can represent this data. To get started, we are planning to use a matrix giving the distance between each pair of the RAS BB beads. That gives a 184x184 matrix, where each row is the Euclidean distance from a particular bead to every bead in the chain (including itself). + +In addition to that, we'd like to tack on a last column to this matrix that encodes the z-value for each bead. We may want to normalize that by using the relative distance to the z-position of the first bead in the chain (or maybe the last bead, I forget). 
That particular bead, which ever one it is, is considered to be attached to the cell membrane, so it serves as a good baseline. + +Can you please help us extend the LBANN data reader to support this additional input? + +There is one extra complication in that some of these patch files have more than one RAS. I think to get started, we only want to consider those patches with a single RAS. + +That might be hard to express well in email, so give me a call if you'd like to talk through the details. +========= +For "z-value", I just mean the value of the z-coordinate of each bead. It turns out the cell membrane lies in the xy-plane so that the z-coordinate encodes the distance each bead is from the plane, once we subtract off the value of z-coordinate for the anchor bead that is attached the membrane. + +There is an sr4_counts.npz file that list the count of RAS in each patch. We can use that to filter out patches with more than one RAS. To get started, it might be ok to ignore that fact. I think I have only captured a single RAS backbone in each patch, rather than a 184-chain for each one. At some point when we care about patches with more than one RAS, I'll likely have to regenerate the dataset to make that clean. + diff --git a/applications/CANDLE/pilot2/tools/README.txt b/applications/CANDLE/pilot2/tools/README.txt new file mode 100644 index 00000000000..9dea7ae2278 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/README.txt @@ -0,0 +1,7 @@ +The tools in this directory are embarrassingly +parallel. They don't use GPUs, so you are advised to compile +lbann without CUDA, in order to use all avalailable CPUs +on your nodes. + +Typical invocation on lassen: + $ jsrun -n 8 -a 40 -d packed -b "packed:10" -r 1 -c 40 diff --git a/applications/CANDLE/pilot2/tools/README_bbs_binary_file_format.txt b/applications/CANDLE/pilot2/tools/README_bbs_binary_file_format.txt new file mode 100644 index 00000000000..c7a3a98b105 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/README_bbs_binary_file_format.txt @@ -0,0 +1,17 @@ +Format for binary files; all values are floats + +num_frames #aka, num_samples +num_beads #for now, will always be 184 + +#Repeating, for each frame: + + z-coordinates for each bead #184 entries + + #Repeating, for each beads in the current frame: + euclidean distance between beads j and k, + j=0..num_beads-1, k=j+1..num_beads (16836 entries per frame) + +Notes: + Optional normalization of euclidean distances and/or Z-coordinates + will be computed during the data_reader load method + diff --git a/applications/CANDLE/pilot2/tools/common.hpp b/applications/CANDLE/pilot2/tools/common.hpp new file mode 100644 index 00000000000..2732d238091 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/common.hpp @@ -0,0 +1,112 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef __PILOT2_TOOLS_COMMON_HPP_ +#define __PILOT2_TOOLS_COMMON_HPP_ + + +namespace lbann { + +const int Num_beads = 184; +const int Dims = 3; +const int Word_size = 4; + +const int Num_dist = 16836; + // 16836 is number of euclid distances + // for j in range(0, 183): + // for k in range(j+1, 184): + // t += 1 + +//======================================================================= +struct xyz { + xyz() {} + xyz(float xx, float yy, float zz) : x(xx), y(yy), z(zz) { } + + float x; + float y; + float z; + + float dist(const xyz &p) { + return sqrt( + (pow( (x-p.x), 2) + + pow( (x-p.x), 2) + + pow( (x-p.x), 2)) + ); + } + friend std::ostream& operator<<(std::ostream& os, const xyz& p); +}; + +std::ostream& operator<<(std::ostream& os, const xyz &p) { + os << p.x << "," << p.y << "," << p.z << " "; + return os; +} + +//======================================================================= + +//void testme(); + +bool sanity_check_npz_file(std::map &a, const std::string filename) { + const std::vector shape = a["bbs"].shape; + const float num_samples = static_cast(shape[0]); + const int word_size = static_cast(a["bbs"].word_size); + bool is_good = true; + if (shape[1] != Num_beads || shape[2] != Dims || word_size != Word_size) { + is_good = false; + std::stringstream s3; + for (auto t : shape) { s3 << t << " "; } + LBANN_WARNING("Bad file: ", filename, " word_size: ", word_size, " dinum_samples: ", num_samples, " shape: ", s3.str()); + } + return is_good; +} + +void read_sample( + int id, + std::vector &data, + std::vector &z_coordinates, + std::vector &distances) { + + size_t offset = 2 /* n_frames, n_beads */ + id * (Num_beads + Num_dist); + z_coordinates.resize(Num_beads); + for (size_t j=offset; j < offset + Num_beads; j++) { + z_coordinates[j-offset] = data[j]; + } + offset += Num_beads; + for (size_t j = offset; j < offset + Num_dist; j++) { + if (j >= data.size()) { + LBANN_ERROR("j >= data.size(); j: ",j, " datalsize: ", data.size(), " offset: ", offset, " Num_beads: ",Num_beads, " Num_dist: ", Num_dist); + } + if (j-offset >= distances.size()) { + LBANN_ERROR("j-offset >= data.size(); j-offset: ", j-offset, " data.size: ", data.size()); + } + distances[j-offset] = data[j]; + } +} + + +} //namespace lbann + +#endif // __PILOT2_TOOLS_COMMON_HPP_ diff --git a/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_euclid_distances.cpp b/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_euclid_distances.cpp new file mode 100644 index 00000000000..6c3fe6ae9d6 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_euclid_distances.cpp @@ -0,0 +1,140 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/comm.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/jag_utils.hpp" +#include "lbann/utils/commify.hpp" +#include +#include +#include +#include "common.hpp" + +using namespace lbann; + +int main(int argc, char *argv[]) { + world_comm_ptr comm = initialize(argc, argv); + bool master = comm->am_world_master(); + + try { + options *opts = options::get(); + opts->init(argc, argv); + + if (! opts->has_string("filelist")) { + LBANN_ERROR("usage: ", argv[0], " --filelist="); + } + std::string input_fn = opts->get_string("filelist"); + + int rank = comm->get_rank_in_world(); + int np = comm->get_procs_in_world(); + + // get list of input filenames + std::vector filenames; + read_filelist(comm.get(), input_fn, filenames); + + size_t nn = 0; // only used for user feedback + std::vector beads(Num_beads); + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + bool is_good = sanity_check_npz_file(a, filenames[j]); + + // Open output file + std::string fn = filenames[j] + ".bbs_stats"; + if (!is_good) { + fn += ".bad"; + } + std::ofstream out(fn.c_str(), std::ios::binary); + if (!out) { + LBANN_ERROR("failed to open ", fn, "for writing"); + } + + if (is_good) { + const std::vector shape = a["bbs"].shape; + const float num_frames = static_cast(shape[0]); + + // output number of frames and beads + out.write((char*)&num_frames, sizeof(float)); + float nbeads = static_cast(Num_beads); + out.write((char*)&nbeads, sizeof(float)); + + // Get the bbs data array + const float *bd = a["bbs"].data(); + + // Loop over the samples (frames) + for (int k=0; k= i + for (int i=0; i +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/comm.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/jag_utils.hpp" +#include "lbann/utils/commify.hpp" +#include +#include +#include +#include "common.hpp" + +using namespace lbann; + +void read_file(const std::string &filename, std::vector &data); + +int main(int argc, char *argv[]) { + world_comm_ptr comm = initialize(argc, argv); + bool master = comm->am_world_master(); + + + try { + options *opts = options::get(); + opts->init(argc, argv); + + if (! opts->has_string("filelist")) { + LBANN_ERROR("usage: ", argv[0], " --filelist="); + } + + std::string input_fn = opts->get_string("filelist"); + + int rank = comm->get_rank_in_world(); + int np = comm->get_procs_in_world(); + + // get list of input filenames + std::vector filenames; + read_filelist(comm.get(), input_fn, filenames); + + std::vector data; + std::vector z_coords(Num_beads); + std::vector distances(Num_dist); + + double max = FLT_MIN; + double min = FLT_MAX; + double total= 0; //for computing mean + double n_samples = 0; //for computing mean + + size_t nn = 0; + for (size_t j=rank; j(*w++); + int n_beads = static_cast(*w++); + if (n_beads != Num_beads) { + LBANN_ERROR("n_beads != Num_beads; n_beads: ", n_beads, " Num_beads: ", Num_beads); + } + for (int h=0; h max) { max = dist_h_to_i; } + total += dist_h_to_i; + ++n_samples; + offset++; + } + } + } + + // User feedback + ++nn; + if (!rank) { + std::cerr << "approx " << (nn*np) << " files of " + << filenames.size() << " processed\n"; + } + } + + //================================================================== + + // Collect and report global min/max/mean/std-dev values + // (using MPI native calls because having separate calls for root/non-root + // processes is just annoying. 
We also have well over a dozen reduce + // methods, and I can never remember which to use) + // + double max_all; + double min_all; + double total_all; + double n_samples_all; + + // only master needs to know min and max + MPI_Reduce(&max, &max_all, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(&min, &min_all, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + // all ranks need to know totals and num_samples, in order to compute + // std deviation + MPI_Allreduce(&total, &total_all, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&n_samples, &n_samples_all, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + double mean = (total_all / n_samples_all); + + // compute standard deviation + double v_minus_mean_squared = 0.; + nn = 0; + for (size_t j=rank; j(*w++); + int n_beads = static_cast(*w++); + if (n_beads != Num_beads) { + LBANN_ERROR("n_beads != Num_beads"); + } + + for (int h=0; h &data) { + std::ifstream in(filename, std::ios::binary); + if (!in) { + LBANN_ERROR("failed to open ", filename, " for reading"); + } + in.seekg(0, in.end); + size_t n = in.tellg(); + in.seekg(0, in.beg); + data.resize(n); + char *work = reinterpret_cast(data.data()); + in.read(work, n); + if (static_cast(in.gcount()) != n) { + LBANN_ERROR("in.gcount() != n (gcount: ", in.gcount(), "; n: ", n, ") for file: ", filename); + } +} diff --git a/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_max_min.cpp b/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_max_min.cpp new file mode 100644 index 00000000000..04d5a06918a --- /dev/null +++ b/applications/CANDLE/pilot2/tools/compute_ras_lipid_bbs_max_min.cpp @@ -0,0 +1,187 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/comm.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/jag_utils.hpp" +#include "lbann/utils/commify.hpp" +#include +#include +#include + +using namespace lbann; + +int main(int argc, char *argv[]) { + world_comm_ptr comm = initialize(argc, argv); + bool master = comm->am_world_master(); + + try { + options *opts = options::get(); + opts->init(argc, argv); + + if (! 
opts->has_string("filelist")) { + LBANN_ERROR("usage: ", argv[0], " --filelist="); + } + + std::string input_fn = opts->get_string("filelist"); + + int rank = comm->get_rank_in_world(); + int np = comm->get_procs_in_world(); + + // get list of input filenames + std::vector filenames; + read_filelist(comm.get(), input_fn, filenames); + + size_t nn = 0; // only for user feedback + std::vector max(3, FLT_MIN); + std::vector min(3, FLT_MAX); + std::vector total(3, 0.); //for computing mean + size_t count = 0; //for compputing mean + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + const std::vector shape = a["bbs"].shape; + const size_t num_frames = shape[0]; + const size_t word_size = a["bbs"].word_size; + bool is_good = true; + if (shape[1] != 184 || shape[2] != 3 || word_size != 4) { + is_good = false; + std::stringstream s3; + for (auto t : shape) { s3 << t << " "; } + LBANN_WARNING("Bad file: ", filenames[j], " word_size: ", word_size, " dinum_frames: ", num_frames, " shape: ", s3.str()); + } + + if (is_good) { + + // Get the bbs data array + const float *data = a["bbs"].data(); + + // Loop over the bbs entries + for (size_t k=0; k max[0]) max[0] = xx; + if (yy < min[1]) min[1] = yy; + if (yy > max[1]) max[1] = yy; + if (zz < min[2]) min[2] = zz; + if (zz > max[2]) max[2] = zz; + total[0] += xx; + total[1] += yy; + total[2] += zz; + data += 3; + ++count; + } + + ++nn; + if (!rank) { + std::cout << "approx " << utils::commify(nn*np) << " files of " + << utils::commify(filenames.size()) << " processed\n"; + } + } + } // END: for (size_t j=rank; j max_all(3); + std::vector min_all(3); + std::vector mean(3); + size_t count_all; + + // only master needs to know min and max + MPI_Reduce(max.data(), max_all.data(), 3, MPI_FLOAT, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(min.data(), min_all.data(), 3, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD); + // all ranks need to know totals and num_samples, in order to compute + // std deviation + MPI_Allreduce(total.data(), mean.data(), 3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&count, &count_all, 3, MPI_LONG, MPI_SUM, MPI_COMM_WORLD); + + for (size_t i=0; i<3; i++) { + mean[i] /= count_all; + } + + // compute standard deviation + std::vector v_minus_mean_squared(3, 0); + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + const std::vector shape = a["bbs"].shape; + const size_t word_size = a["bbs"].word_size; + const size_t num_samples = shape[0]; + bool is_good = true; + if (shape[1] != 184) { is_good = false; } + if (shape[2] != 3) { is_good = false; } + if (word_size != 4) { is_good = false; } + if (is_good) { + const float *data = a["bbs"].data(); + for (size_t k=0; k all_minus_mean_squared(3, 0.); + std::vector std_dev(3, 0.); + MPI_Reduce(v_minus_mean_squared.data(), all_minus_mean_squared.data(), 3, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + if (!rank) { + for (size_t i=0; i<3; i++) { + double v3 = all_minus_mean_squared[i] / count_all; + std_dev[i] = sqrt(v3); + } + + std::cout << "\nmax x/y/z: "; + for (auto t : max_all) std::cout << t << " "; + std::cout << std::endl; + std::cout << "min x/y/z: "; + for (auto t : min_all) std::cout << t << " "; + std::cout << std::endl; + std::cout << "mean x/y/z: "; + for (auto t : mean) std::cout << t << " "; + std::cout << std::endl; + std::cout << "std dev: "; + for (auto t : std_dev) std::cout << t << " "; + std::cout << std::endl; + } + + } catch (std::exception const &e) { + if (master) std::cerr << "caught exception: " << e.what() << "\n"; + return EXIT_FAILURE; + } catch (...) 
{ + std::cerr << "unknown exception in main\n"; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} diff --git a/applications/CANDLE/pilot2/tools/compute_ras_lipid_sig1_normalization.cpp b/applications/CANDLE/pilot2/tools/compute_ras_lipid_sig1_normalization.cpp new file mode 100644 index 00000000000..343c9a9ce31 --- /dev/null +++ b/applications/CANDLE/pilot2/tools/compute_ras_lipid_sig1_normalization.cpp @@ -0,0 +1,219 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "lbann/comm.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/jag_utils.hpp" +#include "lbann/utils/commify.hpp" +#include +#include + +using namespace lbann; + +int main(int argc, char *argv[]) { + world_comm_ptr comm = initialize(argc, argv); + bool master = comm->am_world_master(); + + try { + // Initialize options db (this parses the command line) + options *opts = options::get(); + opts->init(argc, argv); + + if (argc == 1) { + if (master) { + std::cerr << "usage: " << argv[0] << " --filelist= --output_fn=" << std::endl; + } + return EXIT_FAILURE; + } + + if (! 
(opts->has_string("filelist") && opts->has_string("output_fn"))) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: improper invocation; run with no cmd line args for proper invocation"); + } + + const std::string input_fn = opts->get_string("filelist"); + const std::string output_fn = opts->get_string("output_fn"); + + //sanity check that we can write to the output file + if (master) { + std::ofstream out(output_fn.c_str()); + if (!out) { + LBANN_ERROR("failed to open ", output_fn, " for writing"); + } + out.close(); + } + + int rank = comm->get_rank_in_world(); + int np = comm->get_procs_in_world(); + + // get list of input filenames + std::vector filenames; + read_filelist(comm.get(), input_fn, filenames); + + size_t total_elts_per_channel = 0; + std::vector v_max(14, 0.); + std::vector v_min(14, std::numeric_limits::max()); + std::vector v_mean(14, 0); + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + size_t n_elts = a["density_sig1"].num_vals; + double *data = reinterpret_cast(a["density_sig1"].data_holder->data()); + + int s = 0; + for (size_t i=0; i v_max[s]) v_max[s] = vv; + if (vv < v_min[s]) v_min[s] = vv; + ++s; + if (s == 14) { + s = 0; + } + } + if (master) { + std::cerr << "approx " << utils::commify(total_elts_per_channel*np) << " samples processed" << std::endl; + } + } + // ==================== finished processing all files ======================== + + std::vector f_max(14, 0.); + std::vector f_min(14, 0.); + std::vector f_mean(14, 0.); + + comm->trainer_allreduce(v_max.data(), v_max.size(), f_max.data(), El::mpi::MAX); + comm->trainer_allreduce(v_min.data(), v_min.size(), f_min.data(), El::mpi::MIN); + comm->trainer_allreduce(v_mean.data(), v_mean.size(), f_mean.data(), El::mpi::SUM); + size_t n3 = comm->trainer_allreduce(total_elts_per_channel); + for (size_t j=0; j v_minus_mean_squared(14, 0.); + std::vector stdev(14, 0.); + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + size_t n_elts = a["density_sig1"].num_vals; + double *data = reinterpret_cast(a["density_sig1"].data_holder->data()); + + int s = 0; + for (size_t i=0; i f_minus_mean_squared(14, 0.); + std::vector f_std_dev(14, 0.); + comm->trainer_allreduce(v_minus_mean_squared.data(), v_minus_mean_squared.size(), f_minus_mean_squared.data(), El::mpi::SUM); + if (master) std::cout << "n3: " << n3 << std::endl; + for (size_t j=0; j +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
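// The pilot2 statistics tools above all follow the same two-pass pattern for
// distributed mean / standard deviation. A minimal sketch with the MPI calls
// used above ("values" stands in for the per-rank stream of samples):
//
//   double total = 0., count = 0.;
//   for (double v : values) { total += v; ++count; }            // pass 1
//   double total_all = 0., count_all = 0.;
//   MPI_Allreduce(&total, &total_all, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
//   MPI_Allreduce(&count, &count_all, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
//   const double mean = total_all / count_all;
//   double sq = 0.;
//   for (double v : values) { sq += (v - mean) * (v - mean); }  // pass 2
//   double sq_all = 0.;
//   MPI_Reduce(&sq, &sq_all, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
//   const double std_dev = std::sqrt(sq_all / count_all);       // rank 0 only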
+// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "lbann/comm.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/jag_utils.hpp" +#include "lbann/utils/commify.hpp" +#include +#include + +using namespace lbann; + +int main(int argc, char *argv[]) { + world_comm_ptr comm = initialize(argc, argv); + bool master = comm->am_world_master(); + + try { + // Initialize options db (this parses the command line) + options *opts = options::get(); + opts->init(argc, argv); + + if (argc == 1) { + if (master) { + std::cerr << "usage: " << argv[0] << " --filelist=" << std::endl; + } + return EXIT_FAILURE; + } + + if (! (opts->has_string("filelist"))) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: improper invocation; run with no cmd line args for proper invocation"); + } + + const std::string input_fn = opts->get_string("filelist"); + + int rank = comm->get_rank_in_world(); + int np = comm->get_procs_in_world(); + + char b[1024]; + sprintf(b, "debug.%d", rank); + std::ofstream out(b); + if (! out) { + LBANN_ERROR("failed to open ", b, " for reading"); + } + + // get list of input filenames + std::vector filenames; + read_filelist(comm.get(), input_fn, filenames); + + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + + out << "DONE! opening: " << filenames[j] << std::endl; + out.close(); + out.open(b, std::ofstream::out | std::ofstream::app); + } + } catch (std::exception const &e) { + if (master) std::cerr << "caught exception: " << e.what() << "\n"; + return EXIT_FAILURE; + } catch (...) { + std::cerr << "unknown exception in main\n"; + return EXIT_FAILURE; + } + + // Clean up + return EXIT_SUCCESS; +} diff --git a/applications/CANDLE/pilot2/train_ras_classifier.py b/applications/CANDLE/pilot2/train_ras_classifier.py new file mode 100644 index 00000000000..d6de5664fea --- /dev/null +++ b/applications/CANDLE/pilot2/train_ras_classifier.py @@ -0,0 +1,133 @@ +import numpy as np +import lbann +import lbann.modules +from util import preprocess_data + +# Data paths, directory where patches are located +data_dir = 'data' +samples = preprocess_data(data_dir) + +dims = len(samples[0]) + + +num_classes = 3 +num_channels = 14 + +# Sample access functions +def get_sample(index): + sample = samples[index] + return sample + +def num_samples(): + return samples.shape[0] + +def sample_dims(): + return [dims] + +def str_list(l): + return ' '.join([str(i) for i in l]) +# ============================================== +# Setup and launch experiment +# ============================================== + +def construct_model(): + """Model description + + """ + import lbann + import lbann.modules + + + fc = lbann.modules.FullyConnectedModule + conv = lbann.modules.Convolution2dModule + + conv1 = conv(20, 3, stride=1, padding=1,name='conv1') + conv2 = conv(20, 3, stride=1, padding=1,name='conv2') + fc1 = fc(100, name='fc1') + fc2 = fc(20, name='fc2') + fc3 = fc(num_classes, name='fc3') + # Layer graph + input = lbann.Input(name='inp_tensor') + inp_slice = lbann.Slice(input, axis=0, slice_points=str_list([0, dims-1, dims]),name='inp_slice') + xdata = lbann.Identity(inp_slice) + ylabel = lbann.Identity(inp_slice, name='gt_y') + #NHWC to NCHW + x = lbann.Reshape(xdata, dims='14 13 13') + x = conv2(conv1(x)) + x = lbann.Reshape(x, dims='3380') + x = lbann.Dropout(lbann.Relu(fc1(x)),keep_prob=0.5) + x = lbann.Dropout(fc2(x),keep_prob=0.5) + pred = lbann.Softmax(fc3(x)) + 
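    # Classification head: fc3 emits one score per RAS state, the integer
    # label is expanded to a one-hot vector, and the two are compared with
    # cross entropy (objective) and categorical accuracy (metric) below.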
gt_label = lbann.OneHot(ylabel, size=num_classes) + loss = lbann.CrossEntropy([pred,gt_label],name='loss') + acc = lbann.CategoricalAccuracy([pred, gt_label]) + + + layers = list(lbann.traverse_layer_graph(input)) + # Setup objective function + weights = set() + for l in layers: + weights.update(l.weights) + obj = lbann.ObjectiveFunction(loss) + + + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer()] + + # Construct model + num_epochs = 10 + return lbann.Model(num_epochs, + weights=weights, + layers=layers, + metrics=[lbann.Metric(acc, name='accuracy', unit='%')], + objective_function=obj, + callbacks=callbacks) + +def construct_data_reader(): + """Construct Protobuf message for Python data reader. + + The Python data reader will import this Python file to access the + sample access functions. + + """ + import os.path + import lbann + module_file = os.path.abspath(__file__) + module_name = os.path.splitext(os.path.basename(module_file))[0] + module_dir = os.path.dirname(module_file) + + # Base data reader message + message = lbann.reader_pb2.DataReader() + + # Training set data reader + data_reader = message.reader.add() + data_reader.name = 'python' + data_reader.role = 'train' + data_reader.shuffle = True + data_reader.percent_of_data_to_use = 1.0 + data_reader.python.module = module_name + data_reader.python.module_dir = module_dir + data_reader.python.sample_function = 'get_sample' + data_reader.python.num_samples_function = 'num_samples' + data_reader.python.sample_dims_function = 'sample_dims' + + return message + +if __name__ == '__main__': + import lbann + import lbann.contrib.launcher + mini_batch_size = 64 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model() + opt = lbann.Adam(learn_rate=0.001,beta1=0.9,beta2=0.99,eps=1e-8) + data_reader = construct_data_reader() + status = lbann.contrib.launcher.run( + trainer, model, data_reader, opt, + account='hpcdl', + scheduler='slurm', + time_limit=720, + nodes=1, + procs_per_node=1, + setup_only=False, + job_name='candle_p2_ras_classifier') + print(status) diff --git a/applications/CANDLE/pilot2/util.py b/applications/CANDLE/pilot2/util.py new file mode 100644 index 00000000000..9584a3c0941 --- /dev/null +++ b/applications/CANDLE/pilot2/util.py @@ -0,0 +1,95 @@ +import os +import sys +import random +import numpy as np + + +p0_thresh = 0.55 +p1_thresh = 0.85 +p2_thresh = 0.85 + +def preprocess_data(dirspath,channels=None): +# define a tuple of specific channels if user listed them + channels_tuple = tuple(range(14)) + if channels is not None: + channels_tuple = tuple(channels) + + files_train = [] + states = [] + cons = [] + + #for d in dirspath: + for _ in range(1): + # get list of all files in datapath and shuffle them + # sort by filename before shuffle so we could generate + # a consistent list if using the same random seed + filenames = os.listdir(dirspath) + filenames.sort() + random.shuffle(filenames) + + filenames_divide = int(1.0 * len(filenames)) + filenames_train = filenames[:filenames_divide] + + files_train.append([dirspath + "/" + f for f in filenames_train]) + + frame_start = 0 + + for f in filenames_train: + # read in the data file + d = np.load(dirspath + '/' + f) + + # extract fields + p = d['probs'][d['frames'] >= frame_start] + s = d['states'][d['frames'] >= frame_start] + #n = d['density_sig1p5'][d['frames'] >= frame_start] + n = d['density_sig1'][d['frames'] >= frame_start] + #print p.shape, s.shape + + s = s[(p[:,0] > p0_thresh) | (p[:,1] > p1_thresh) | (p[:,2] > p2_thresh)] 
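        # Keep only frames where at least one state probability clears its
        # threshold (p0/p1/p2_thresh above); the same mask is applied to the
        # density maps on the next line so states and densities stay aligned.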
+ n = n[(p[:,0] > p0_thresh) | (p[:,1] > p1_thresh) | (p[:,2] > p2_thresh)] + + states.append(s) + + + # append concentrations, filter out by channel id(s) if given + # can we do channel first here, transpose?, move axis? + n = np.array(n) + n = n.astype(np.float32) + if channels: + cons.append(n[:,:,:,channels_tuple]) + else: + cons.append(n) + + + states = np.concatenate(states,axis=0) + cons = np.concatenate(cons,axis=0) + + # print list of unique state labels and number of each + (values, cnt) = np.unique(states, return_counts=True) + + min_cnt = np.min(cnt) + idx_0 = np.where(states == 0) + idx_0 = idx_0[0][:min_cnt] + idx_1 = np.where(states == 1) + idx_1 = idx_1[0][:min_cnt] + idx_2 = np.where(states == 2) + idx_2 = idx_2[0][:min_cnt] + ids = np.concatenate([idx_0, idx_1, idx_2], axis=0) + states = states[ids] + cons = cons[ids] + + + # normalize each concentration channel independently + mins = cons.min(axis=(0,1,2), keepdims=True) + maxs = cons.max(axis=(0,1,2), keepdims=True) + + cons /= maxs + labels = states + + #transpose to NCHW + cons = cons.transpose(0,3,1,2) + + X = cons.reshape(cons.shape[0],-1) + y = labels.reshape(-1,1) + Xy_data = np.hstack((X,y)) + return Xy_data diff --git a/applications/CONTRIBUTING.md b/applications/CONTRIBUTING.md new file mode 100644 index 00000000000..72ff8591e3f --- /dev/null +++ b/applications/CONTRIBUTING.md @@ -0,0 +1,49 @@ +## Contributing Applications: + +The application directory contains the user-facing code for projects +to use LBANN. Each project directory should contain the python code +to instantiate the model, run both training and inference, an +experiments directory, as well as utility / helper code to pre- or +post-process data. In addition to project-specific directories the +directory hierarchy groups together similar projects into broader +categories, such as vision-based networks. + +### Directory Structure: + +``` +applications +└─── ATOM +``` + +The applications directory has primary __projects__ directories as well +as __categories__ that contain related __projects__. + +### Project Directory Structure: + +The general structure of a project directory should be: + +``` + +└─── README.md +└─── .py +└─── lib_.py +└─── experiments + └─── run_.py +└─── utils + +``` + +* README.md + * Describe the project, how to run it, etc. +* `.py` + * Python code that builds the model's compute graph +* `lib_.py` + * Common Python code that builds common substructurs used by the + application +* experiments + * Directory to run an experiment. Should include launcher scripts, + etc. + * `run_.py` + * Launcher script to run the model in train or inference mode +* utils + * Directory for holding pre- and post-processing scripts diff --git a/applications/MOF/MOFae.py b/applications/MOF/MOFae.py new file mode 100644 index 00000000000..bfef507aaad --- /dev/null +++ b/applications/MOF/MOFae.py @@ -0,0 +1,115 @@ +import lbann +import os +import os.path + +# ---------------------------------- +# Construct Graph +# ---------------------------------- +def gen_layers(latent_dim, number_of_atoms): + ''' Generates the model for the 3D Convolutional Auto Encoder. + + returns the Directed Acyclic Graph (DAG) that the lbann + model will run on. 
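    Shapes, as set up below: four kernel-4, stride-2 convolutions reduce the
    (number_of_atoms)x32x32x32 input to (latent_dim)x2x2x2, a final kernel-2
    convolution yields the (latent_dim)x1x1x1 code, and the decoder mirrors
    this with deconvolutions back to (number_of_atoms)x32x32x32.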
+ ''' + input_ = lbann.Input( target_mode = "reconstruction") + tensors = lbann.Identity(input_) + + tensors = lbann.Reshape(tensors, dims="11 32 32 32", name="Sample") + # Input tensor shape is (number_of_atoms)x32x32x32 + + # Encoder + + x = lbann.Identity(tensors) + for i in range(4): + out_channels = latent_dim // (2 ** (3-i)) + + x = lbann.Convolution(x, + num_dims = 3, + num_output_channels = out_channels, + num_groups = 1, + conv_dims_i = 4, + conv_strides_i = 2, + conv_dilations_i = 1, + conv_pads_i = 1, + has_bias = True, + name="Conv_{0}".format(i)) + + x = lbann.BatchNormalization(x, name="Batch_NORM_{0}".format(i+1)) + x = lbann.LeakyRelu(x, name="Conv_{0}_Activation".format(i+1)) + + # Shape: (latent_dim)x2x2x2 + encoded = lbann.Convolution(x, + num_dims = 3, + num_output_channels = latent_dim, + num_groups = 1, + conv_dims_i = 2, + conv_strides_i = 2, + conv_dilations_i = 1, + conv_pads_i = 0, + has_bias = True, + name ="encoded") + + # Shape: (latent_dim)1x1x1 + + # Decoder + + x = lbann.Deconvolution(encoded, + num_dims = 3, + num_output_channels = number_of_atoms * 16, + num_groups = 1, + conv_dims_i = 4, + conv_pads_i = 0, + conv_strides_i = 2, + conv_dilations_i = 1, + has_bias = True, + name="Deconv_1" + ) + x = lbann.BatchNormalization(x, name="BN_D1") + x = lbann.Tanh(x, name="Deconv_1_Activation") + + for i in range(3): + out_channels = number_of_atoms * (2 ** (2-i)) + x = lbann.Deconvolution(x, + num_dims = 3, + num_output_channels = out_channels, + num_groups = 1, + conv_dims_i = 4, + conv_pads_i = 1, + conv_strides_i = 2, + conv_dilations_i = 1, + has_bias = True, + name="Deconv_{0}".format(i+2) + ) + x = lbann.BatchNormalization(x, name="BN_D{0}".format(i+2)) + + if (i != 2): #Save the last activation layer because we want to dump the outputs + x = lbann.Tanh(x, name="Deconv_{0}_Activation".format(i+2)) + + decoded = lbann.Tanh(x, + name = "decoded") + + img_loss = lbann.MeanSquaredError([decoded, tensors]) + + metrics = [lbann.Metric(img_loss, name='recon_error')] + # ---------------------------------- + # Set up DAG + # ---------------------------------- + + layers = lbann.traverse_layer_graph(input_) #Generate Model DAG + return layers, img_loss, metrics +def make_data_reader(): + reader = lbann.reader_pb2.DataReader() + _reader = reader.reader.add() + _reader.name = 'python' + _reader.role = 'train' + _reader.shuffle = True + _reader.percent_of_data_to_use = 1.0 + _reader.python.module = 'dataset' + _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + _reader.python.sample_function = 'get_train' + _reader.python.num_samples_function = 'num_train_samples' + _reader.python.sample_dims_function = 'sample_dims' + + return reader + + diff --git a/applications/MOF/README.md b/applications/MOF/README.md new file mode 100644 index 00000000000..d5444889519 --- /dev/null +++ b/applications/MOF/README.md @@ -0,0 +1,52 @@ +# Example models for 3D molecular generation + +This directory contains LBANN implementations of 3D molecular generation models for Metal Organic Frameworks from the CoRE MOF Database. The models are based on 3D convolutional fliters on periodic 3D voxel grids. + + +## Dataset Information + +The dataset used is a subset of the [CoRE MOF Database](https://gregchung.github.io/CoRE-MOFs/). Each Metal Organic Framework is represented as a 32x32x32x11 tensor. + +The representation is channel-wise concatenation of 11 32x32x32 voxel grids, where each voxel grid represents the location of a particular element. 
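For orientation, a minimal sketch of how one sample reaches the model (illustrative only; the real reader lives in `dataset.py`, the reshape in `MOFae.gen_layers`, and the channel-last layout of the stand-in array is an assumption):

```python
import numpy as np

# Hypothetical stand-in for one CoRE MOF sample: 11 element channels,
# each a 32x32x32 voxel grid.
mof = np.random.rand(32, 32, 32, 11).astype(np.float32)

sample = mof.flatten()                        # what get_train(index) returns
assert sample.shape == (32 * 32 * 32 * 11,)   # matches sample_dims()

# Inside the model, the flat vector is reshaped back to "11 32 32 32"
# (channels first) before the 3D convolutions are applied.
```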
+ +## Running Instructions + +Run in conjuction with the correct slurm / lsf command: + +``` +python3 main.py --nodes N --procs-per-node P --mini-batch-size B +``` +## Testing Dataset + +To test the dataset: + +``` +python3 -m unittest test/* +``` + +To test integration and performance: + +``` +cd test +python3 -m pytest +``` +## Links + +For more information on the data representation: + + + +@article {Kimeaax9324, + author = {Kim, Baekjun and Lee, Sangwon and Kim, Jihan}, + title = {Inverse design of porous materials using artificial neural networks}, + volume = {6}, + number = {1}, + elocation-id = {eaax9324}, + year = {2020}, + doi = {10.1126/sciadv.aax9324}, + publisher = {American Association for the Advancement of Science}, + } + eprint = {https://advances.sciencemag.org/content/6/1/eaax9324.full.pdf}, + journal = {Science Advances} +} + diff --git a/applications/MOF/data/MOFdataset.py b/applications/MOF/data/MOFdataset.py new file mode 100755 index 00000000000..19439578be2 --- /dev/null +++ b/applications/MOF/data/MOFdataset.py @@ -0,0 +1,36 @@ +from pathlib import Path +from typing import List +import os +import numpy as np + + +class MOFDataset(): + ''' + Custom Dataset loader for MOF data. + ''' + def __init__(self, path, transform=None): + self.path = path + path = Path(path) + self.data = np.load(path) + self.transform = transform + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + if self.transform is not None: + return self.transform(self.data[idx]) + else: + return self.data[idx] + + +def test(): + data_dir = os.path.dirname(os.path.realpath(__file__)) + test_file_path = os.path.join(data_dir, 'mofs.npy') + test_data = MOFDataset(test_file_path) + + print(test_data[0].shape) + print(len(test_data)) + +if __name__ == '__main__': + test() diff --git a/applications/MOF/data/__init__.py b/applications/MOF/data/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/applications/MOF/dataset.py b/applications/MOF/dataset.py new file mode 100755 index 00000000000..a88f80f450a --- /dev/null +++ b/applications/MOF/dataset.py @@ -0,0 +1,50 @@ +import os +import numpy as np +from data.MOFdataset import MOFDataset + +# MOFdaset is a custom dataset class extending torch.utils.data.Dataset + +## +## For an example look at: +## https://github.com/LLNL/lbann/blob/develop/applications/nlp/transformer/dataset.py +## + +data_dir = os.path.dirname(os.path.realpath(__file__)) + +## Add CLI arguments for training file location and error handling +train_file_path = os.path.join(data_dir, 'data/train_mofs.npy') +test_file_path = os.path.join(data_dir, 'data/test_mofs.npy') + + +training_data = MOFDataset(train_file_path) +test_data = MOFDataset(test_file_path) + +def get_train (index): + return np.float32(training_data[index].flatten()) #Iterable or 1 D array + +def get_test (index): + return np.float32(test_data[index].flatten()) #Iterable or 1D array +def num_train_samples(): + return len(training_data) + +def num_test_samples(): + return len(test_data) + +def sample_dims(): + return (32*32*32*11, ) + +if __name__ == '__main__': + data_dir = os.path.dirname(os.path.realpath(__file__)) + +## Add CLI arguments for training file location and error handling + train_file_path = os.path.join(data_dir, 'data/train_mofs.npy') + test_file_path = os.path.join(data_dir, 'data/test_mofs.npy') + + training_data =MOFDataset(train_file_path, no_grid=True) + test_data = MOFDataset(test_file_path, no_grid=True) + + print(len(training_data)) + 
print(training_data[0].shape) + + + diff --git a/applications/MOF/main.py b/applications/MOF/main.py new file mode 100644 index 00000000000..decc30ba52d --- /dev/null +++ b/applications/MOF/main.py @@ -0,0 +1,92 @@ +import argparse +import lbann +import MOFae +import dataset +import os +import lbann.contrib.launcher +import lbann.contrib.args +# ---------------------------------- +# Command-line arguments +# ---------------------------------- + + +desc = ("Training 3D-CAE on 4D MOF Data using LBANN") + +parser = argparse.ArgumentParser(description = desc) + +parser.add_argument( + '--zdim', action='store',default = 2048, type=int, + help="dimensionality of latent space (dedfault: 2048)", metavar = 'NUM') +parser.add_argument( + '--atoms', action='store', default = 11,type=int, + help="Number of atom species (default: 11)", metavar = 'NUM') +parser.add_argument( + '--job-name', action='store', default='mofae', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=128, type=int, + help='mini-batch size (default: 128)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=100, type=int, + help='number of epochs (default: 100)', metavar='NUM') + +lbann.contrib.args.add_scheduler_arguments(parser) +args = parser.parse_args() + + +latent_dim = args.zdim +number_of_atoms = args.atoms + + +layers, img_loss, metrics = MOFae.gen_layers(latent_dim, number_of_atoms) +mini_batch_size = args.mini_batch_size +num_epochs = args.num_epochs + +# Callbacks for Debug and Running Model + +print_model = lbann.CallbackPrintModelDescription() #Prints initial Model after Setup + +training_output = lbann.CallbackPrint( interval = 1, + print_global_stat_only = False) #Prints training progress + +gpu_usage = lbann.CallbackGPUMemoryUsage() + +encoded_output = lbann.CallbackDumpOutputs( layers = "decoded", batch_interval = 400, directory = os.path.dirname(os.path.realpath(__file__)), format="npy") + +# ---------------------------------- +# Set up Experiment +# ---------------------------------- + +#Generate Model +model = lbann.Model(num_epochs, + layers = layers, + objective_function = img_loss, + metrics = metrics, + callbacks = [print_model, training_output, gpu_usage, encoded_output] + ) + +#Optimizer + +opt = lbann.Adam(learn_rate = 1e-2, + beta1 = 0.9, + beta2 = 0.99, + eps = 1e-8 + ) + +data_reader = MOFae.make_data_reader() + + +#Trainer + +trainer = lbann.Trainer(mini_batch_size = mini_batch_size, + name = "MOF_AE_1" + ) + +# ---------------------------------- +# Run Experiment +# ---------------------------------- + +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + + +lbann.contrib.launcher.run(trainer, model, data_reader, opt, **kwargs) diff --git a/applications/MOF/test/__init__.py b/applications/MOF/test/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/applications/MOF/test/conftest.py b/applications/MOF/test/conftest.py new file mode 100644 index 00000000000..e62cd503f49 --- /dev/null +++ b/applications/MOF/test/conftest.py @@ -0,0 +1,41 @@ +import sys +sys.path.insert(0, '../../../bamboo/common_python') +import tools +import pytest, re, subprocess + + +def pytest_addoption(parser): + cluster = re.sub('[0-9]+', '', subprocess.check_output( + 'hostname'.split()).decode('utf-8').strip()) + default_dirname = subprocess.check_output( + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() + default_exes = tools.get_default_exes(default_dirname, cluster) + + 
parser.addoption('--cluster', action='store', default=cluster, + help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. Default the current cluster') + parser.addoption('--dirname', action='store', default=default_dirname, + help='--dirname= to specify the top-level directory. Default directory of build_lbann_lc executable') + parser.addoption('--exes', action='store', default=default_exes, + help='--exes={compiler_name: path}') + parser.addoption('--weekly', action='store_true', default=False, + help='--weekly specifies that the test should ONLY be run weekly, not nightly. Default False') + + +@pytest.fixture +def cluster(request): + return request.config.getoption('--cluster') + + +@pytest.fixture +def dirname(request): + return request.config.getoption('--dirname') + + +@pytest.fixture +def exes(request): + return request.config.getoption('--exes') + + +@pytest.fixture +def weekly(request): + return request.config.getoption('--weekly') diff --git a/applications/MOF/test/dataset_test.py b/applications/MOF/test/dataset_test.py new file mode 100644 index 00000000000..4c4d243ab9c --- /dev/null +++ b/applications/MOF/test/dataset_test.py @@ -0,0 +1,39 @@ +import unittest +import os.path +import sys +import numpy as np + + +root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + + +sys.path.append(root_dir) + +import dataset + + + +# TO DO: Add data to lustre + gpfs for easier testing + +class dataset_test(unittest.TestCase): + + def test_num_train_samples(self): + #print("Testing num train samples") + self.assertEqual(dataset.num_train_samples(), 64) + + def test_get_train(self): + + #print("Testing get train") + for i in range(dataset.num_train_samples()): + mof = dataset.get_train(i) + self.assertIsInstance(mof, np.ndarray) + + + def test_sample_dims(self): + # print("Testing Sample Dims") + self.assertEqual(dataset.sample_dims()[0], dataset.get_train(0).size) + + + +if __name__ == '__main__': + unittest.main() diff --git a/applications/MOF/test/test_integration_mof.py b/applications/MOF/test/test_integration_mof.py new file mode 100644 index 00000000000..a0b0be9c738 --- /dev/null +++ b/applications/MOF/test/test_integration_mof.py @@ -0,0 +1,160 @@ +import functools +import operator +import os +import os.path +import re +import sys +import pytest + +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +root_dir = os.path.dirname(current_dir) + +sys.path.append(root_dir) # Added lbann/applications/MOF directory + + +import dataset +import MOFae +applications_dir = os.path.dirname(root_dir) +lbann_dir = os.path.dirname(applications_dir) +common_python_dir = os.path.join(lbann_dir, 'bamboo/common_python')# Added lbann/bamboo/common_python +sys.path.append(common_python_dir) +import tools + +#Training options +num_epochs = 10 +mini_batch_size = 64 +num_nodes = 2 + +# Error + +expected_MSE_range = (0.09, 0.11) + +expected_mini_batch_times = { + 'ray': .35, + 'pascal':.35 + } + + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + args: + lbann (module): Module for LBANN Python frontend + + """ + + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + reader = make_data_reader(lbann) + + # No validation set + + optimizer = lbann.Adam(learn_rate=0.01, beta1=0.9, beta2=0.99, eps=1e-8 ) + return trainer, model, reader, optimizer + +def make_data_reader(lbann): + """Construct LBANN data reader + + """ + reader = lbann.reader_pb2.DataReader() + _reader = reader.reader.add() + _reader.name = 'python' + _reader.role = 'train' + _reader.shuffle = True + _reader.percent_of_data_to_use = 1.0 + _reader.python.module = 'dataset' + _reader.python.module_dir = root_dir + _reader.python.sample_function = 'get_train' + _reader.python.num_samples_function = 'num_train_samples' + _reader.python.sample_dims_function = 'sample_dims' + + return reader +def construct_model(lbann): + + latent_dim = 2048 + number_of_atoms = 11 + layers, img_loss, metrics = MOFae.gen_layers(latent_dim, number_of_atoms) + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + + return lbann.Model(num_epochs, + layers = layers, + objective_function = img_loss, + metrics = metrics, + callbacks = callbacks + ) + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. + + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname): + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + test_accuracy = None + mini_batch_times = [] + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ recon_error : ([0-9.]+)', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_MSE_range[0] + < train_accuracy + = 2: + max_id = max(max_id, int(line[0])) + max_id = max(max_id, int(line[1])) + if max_id < 0: + raise RuntimeError('Graph has no non-negative node IDs') + return max_id diff --git a/applications/graph/evaluate.py b/applications/graph/evaluate.py new file mode 100644 index 00000000000..8b67d2b9e38 --- /dev/null +++ b/applications/graph/evaluate.py @@ -0,0 +1,49 @@ +"""Helper script to evaluate quality of node embeddings. + +Converts the embedding weights computed by LBANN into a format that +can be read by Keita's evaluation script. 
+ +""" +import argparse +import os.path +import sys + +import numpy as np + +# Command-line arguments +parser = argparse.ArgumentParser() +parser.add_argument( + 'embedding_file', type=str, + help='node embeddings computed by LBANN', metavar='EMBEDDING_FILE') +parser.add_argument( + 'label_file', type=str, + help='node labels', metavar='LABEL_FILE') +parser.add_argument( + '--snap-embedding-file', default='results.emb', type=str, + help='node embeddings in SNAP format', metavar='FILE') +args = parser.parse_args() + +# Construct embedding file in SNAP's format +embeddings = np.loadtxt(args.embedding_file) +embeddings = np.transpose(embeddings) +with open(args.snap_embedding_file, 'w') as f: + f.write(f'{embeddings.shape[0]} {embeddings.shape[1]}\n') + for index, embedding in enumerate(embeddings): + f.write(f'{index} {" ".join(str(x) for x in embedding)}\n') + +# Evaluate embeddings with Keita's evaluation script +root_dir = os.path.dirname(os.path.realpath(__file__)) +eval_script_dir = os.path.join( + root_dir, + 'largescale_node2vec', + 'evaluation', + 'multi_label_classification' +) +sys.path.append(eval_script_dir) +import multi_label_classification +multi_label_classification.main([ + '-x', args.snap_embedding_file, + '-y', args.label_file, + '-r', 0.9, + '-n', 10 +]) diff --git a/applications/graph/largescale_node2vec b/applications/graph/largescale_node2vec new file mode 160000 index 00000000000..1b0aa43fdf5 --- /dev/null +++ b/applications/graph/largescale_node2vec @@ -0,0 +1 @@ +Subproject commit 1b0aa43fdf5f8e956915926305f3e55c2c17972e diff --git a/applications/graph/main.py b/applications/graph/main.py new file mode 100644 index 00000000000..82c63a912b7 --- /dev/null +++ b/applications/graph/main.py @@ -0,0 +1,149 @@ +"""Learn embedding weights with LBANN.""" +import argparse +import os.path + +import lbann +import lbann.contrib.launcher +import lbann.contrib.args + +import dataset +from utils import make_iterable, str_list +import utils.snap + +# ---------------------------------- +# Options +# ---------------------------------- + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_node2vec', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=1, type=int, + help='number of epochs (default: 1)', metavar='NUM') +parser.add_argument( + '--latent-dim', action='store', default=128, type=int, + help='latent space dimensions (default: 128)', metavar='NUM') +parser.add_argument( + '--learning-rate', action='store', default=-1, type=float, + help='learning rate (default: 0.025*mbsize)', metavar='VAL') +parser.add_argument( + '--work-dir', action='store', default=None, type=str, + help='working directory', metavar='DIR') +args = parser.parse_args() + +# ---------------------------------- +# Embedding weights +# ---------------------------------- + +encoder_embeddings_weights = lbann.Weights( + initializer=lbann.NormalInitializer( + mean=0, standard_deviation=1/args.latent_dim, + ), + name='embeddings', +) +decoder_embeddings_weights = lbann.Weights( + initializer=lbann.ConstantInitializer(value=0), + name='decoder_embeddings', +) + +# ---------------------------------- +# Construct layer graph +# ---------------------------------- + +# Properties of graph and random 
walk +num_graph_nodes = dataset.max_graph_node_id() + 1 +walk_length = dataset.walk_context_length +num_negative_samples = dataset.num_negative_samples +input_size = dataset.sample_dims()[0] + +# Embedding vectors, including negative sampling +# Note: Input is sequence of graph node IDs +input_ = lbann.Identity(lbann.Input()) +input_slice = lbann.Slice( + input_, + slice_points=f'0 {num_negative_samples+1} {input_size}' +) +decoder_embeddings = lbann.Embedding( + input_slice, + weights=decoder_embeddings_weights, + num_embeddings=num_graph_nodes, + embedding_dim=args.latent_dim, +) +encoder_embeddings = lbann.Embedding( + input_slice, + weights=encoder_embeddings_weights, + num_embeddings=num_graph_nodes, + embedding_dim=args.latent_dim, +) + +# Skip-Gram with negative sampling +preds = lbann.MatMul(decoder_embeddings, encoder_embeddings, transpose_b=True) +preds_slice = lbann.Slice( + preds, + axis=0, + slice_points=f'0 {num_negative_samples} {num_negative_samples+1}') +preds_negative = lbann.Identity(preds_slice) +preds_positive = lbann.Identity(preds_slice) +obj_positive = lbann.LogSigmoid(preds_positive) +obj_positive = lbann.Reduction(obj_positive, mode='sum') +obj_negative = lbann.WeightedSum(preds_negative, scaling_factors='-1') +obj_negative = lbann.LogSigmoid(obj_negative) +obj_negative = lbann.Reduction(obj_negative, mode='sum') +obj = [ + lbann.LayerTerm(obj_positive, scale=-1), + lbann.LayerTerm(obj_negative, scale=-1/num_negative_samples), +] + +# ---------------------------------- +# Create data reader +# ---------------------------------- + +reader = lbann.reader_pb2.DataReader() +_reader = reader.reader.add() +_reader.name = 'python' +_reader.role = 'train' +_reader.shuffle = True +_reader.percent_of_data_to_use = 1.0 +_reader.python.module = 'dataset' +_reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) +_reader.python.sample_function = 'get_sample' +_reader.python.num_samples_function = 'num_samples' +_reader.python.sample_dims_function = 'sample_dims' + +# ---------------------------------- +# Run LBANN +# ---------------------------------- + +# Create optimizer +# Note: Learning rate in original word2vec is 0.025 +learning_rate = args.learning_rate +if learning_rate < 0: + learning_rate = 0.025 * args.mini_batch_size +opt = lbann.SGD(learn_rate=learning_rate) + +# Create LBANN objects +trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) +callbacks = [ + lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDumpWeights(basename='embeddings', + epoch_interval=args.num_epochs), +] +model = lbann.Model(args.num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=obj, + callbacks=callbacks) + +# Run LBANN +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.launcher.run(trainer, model, reader, opt, + job_name=args.job_name, + work_dir=args.work_dir, + overwrite_script=True, + **kwargs) diff --git a/applications/graph/snap b/applications/graph/snap new file mode 160000 index 00000000000..907c34aac6b --- /dev/null +++ b/applications/graph/snap @@ -0,0 +1 @@ +Subproject commit 907c34aac6bcddc7c2f8efb64be76e87dd7e4ea5 diff --git a/applications/graph/test/test_dataset.py b/applications/graph/test/test_dataset.py new file mode 100644 index 00000000000..85525ae0c47 --- /dev/null +++ b/applications/graph/test/test_dataset.py @@ -0,0 +1,38 @@ +import os.path +import random +import sys + +# Local paths +root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(root_dir) + 
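# Aside on the objective built in graph/main.py above: it is the standard
# skip-gram-with-negative-sampling loss. Schematically (a NumPy-style sketch,
# not the actual LBANN layer graph; k = num_negative_samples):
#
#   scores   = dec_emb[candidates] @ enc_emb[walk].T   # (k + 1) x walk_length
#   neg, pos = scores[:k], scores[k]                   # last row = true node
#   loss     = -logsigmoid(pos).sum() - logsigmoid(-neg).sum() / k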
+def test_dataset(): + import dataset + + # Check max node ID + max_graph_node_id = dataset.max_graph_node_id() + assert max_graph_node_id >= 0, 'Negative graph node ID' + assert max_graph_node_id != 0, \ + 'Max graph node ID is zero, ' \ + 'which implies graph has only one node or node IDs are negative' + + # Check sample dimensions + sample_dims = dataset.sample_dims() + assert len(sample_dims) == 1, 'Unexpected dimensions for data sample' + assert sample_dims[0] > 0, 'Invalid dimensions for data sample' + + # Check number of samples + num_samples = dataset.num_samples() + assert num_samples >= 0, 'Invalid number of data samples' + assert num_samples != 0, 'Dataset has no data samples' + + # Check samples + indices = [random.randint(0, num_samples-1) for _ in range(20)] + indices.append(0) + indices.append(num_samples-1) + for index in indices: + sample = dataset.get_sample(index) + assert sample.shape == sample_dims, 'Unexpected dimensions for data sample' + for node in sample: + assert 0 <= node <= max_graph_node_id, \ + 'Invalid graph node ID in data sample' diff --git a/applications/graph/utils/__init__.py b/applications/graph/utils/__init__.py new file mode 100644 index 00000000000..370660d532f --- /dev/null +++ b/applications/graph/utils/__init__.py @@ -0,0 +1,17 @@ +"""Utilities for LBANN graph models""" +import collections.abc + +def make_iterable(obj): + """Convert to an iterable object. + + Simply returns `obj` if it is alredy iterable. Otherwise returns a + 1-tuple containing `obj`. + """ + if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + return obj + else: + return (obj,) + +def str_list(it): + """Convert an iterable object to a space-separated string.""" + return ' '.join([str(i) for i in make_iterable(it)]) diff --git a/applications/graph/utils/snap.py b/applications/graph/utils/snap.py new file mode 100644 index 00000000000..2c65bf7231c --- /dev/null +++ b/applications/graph/utils/snap.py @@ -0,0 +1,121 @@ +"""Utilities to interact with SNAP. + +SNAP is the Stanford Network Analysis Platform. See +https://snap.stanford.edu. + +""" +import os +import os.path +import urllib.request +import gzip +import subprocess + +# Root directory for LBANN graph application +_root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + + +def download_graph(name='ego-Facebook', + graph_file=None): + """Download graph edgelist file from SNAP website. + + Args: + name (str): Name of graph. + graph_file (str, optional): File where uncompressed edge list + will be saved (default: in 'data' directory). + + Returns: + str: Uncompressed edge list file. 
+ + """ + + # Graphs from SNAP + download_urls = { + 'ego-Facebook': 'http://snap.stanford.edu/data/facebook_combined.txt.gz', + } + + # Paths + if not graph_file: + graph_file = os.path.join(_root_dir, 'data', name, 'graph.txt') + data_dir = os.path.dirname(graph_file) + if not os.path.isdir(data_dir): + os.makedirs(data_dir) + data_dir = os.path.realpath(data_dir) + graph_file = os.path.realpath(graph_file) + compressed_file = graph_file + '.gz' + + # Download and uncompress graph file + urllib.request.urlretrieve(download_urls[name], + filename=compressed_file) + with gzip.open(compressed_file, 'rb') as in_file: + with open(graph_file, 'wb') as out_file: + out_file.write(in_file.read()) + + return graph_file + + +def node2vec_walk(graph_file, + walk_file, + walk_length, + walks_per_node, + return_param=1.0, + inout_param=1.0, + directed=False, + weighted=False, + verbose=False): + """Perform random walk on graph for node2vec. + + See https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf + + Args: + graph_file (str): Uncompressed edge list file. + walk_file (str): File where random walks will be saved. + walk_length (int): Walk length. + walks_per_node (int): Number of walks per graph vertex. + return_param (float, optional): p-parameter for random walk + (default: 1.0). + inout_param (float, optional): q-parameter for random walk + (default: 1.0). + directed (bool, optional): Graph is directed (default: False). + weighted (bool, optional): Graph is weighted (default: False). + verbose (bool, optional): Verbose output (default: False). + + """ + + # Check executable + node2vec_exe = os.path.join(_root_dir, 'snap', 'examples', + 'node2vec', 'node2vec') + if not os.path.isfile(node2vec_exe): + raise FileNotFoundError( + 'Could not find node2vec executable at {}. ' + 'Has SNAP been built?' + .format(node2vec_exe) + ) + + # Make sure output directory exists + output_dir = os.path.dirname(os.path.realpath(walk_file)) + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + # Construct invocation + command = [ + node2vec_exe, + '-i:{}'.format(graph_file), + '-o:{}'.format(walk_file), + '-d:-1', + '-l:{}'.format(walk_length), + '-r:{}'.format(walks_per_node), + '-k:-1', + '-e:-1', + '-p:{}'.format(return_param), + '-q:{}'.format(inout_param), + '-ow', + ] + if verbose: + command.append('-v') + if directed: + command.append('-dr') + if weighted: + command.append('-w') + + # Run executable + return subprocess.call(command) diff --git a/applications/nlp/README.md b/applications/nlp/README.md new file mode 100644 index 00000000000..a60513ee0f1 --- /dev/null +++ b/applications/nlp/README.md @@ -0,0 +1,22 @@ +# Example models for natural language processing + +This directory contains LBANN experiments with text data, with the +goal of developing and optimizing NLP functionality. It will +eventually contain reference implementations of widely-used NLP +models. + +## Dependencies + +- PyTorch + +- Transformers: NLP library for TensorFlow and PyTorch. Install with: + +```bash +pip3 install transformers +``` + +- PyTorch-NLP: PyTorch utilities for NLP applications. 
Install with: + +```bash +pip3 install pytorch-nlp +``` diff --git a/applications/nlp/data/.gitignore b/applications/nlp/data/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/applications/nlp/data/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/applications/nlp/experiments/.gitignore b/applications/nlp/experiments/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/applications/nlp/experiments/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/applications/nlp/rnn/dataset.py b/applications/nlp/rnn/dataset.py new file mode 100644 index 00000000000..66fb7c0fd88 --- /dev/null +++ b/applications/nlp/rnn/dataset.py @@ -0,0 +1,25 @@ +import os.path +import sys + +# Local imports +current_file = os.path.realpath(__file__) +root_dir = os.path.dirname(os.path.dirname(current_file)) +sys.path.append(root_dir) +import utils.gutenberg + +# Options +text_name = 'frankenstein' +sequence_length = 10 + +# Download and tokenize text data, if needed +data_url = utils.gutenberg.get_url(text_name) +data_dir = os.path.join(root_dir, 'data', text_name) +corpus = utils.gutenberg.GutenbergCorpus(data_dir, data_url) + +# Sample access functions +def get_sample(index): + return corpus[index:index+sequence_length] +def num_samples(): + return len(corpus) - sequence_length + 1 +def sample_dims(): + return (sequence_length,) diff --git a/applications/nlp/rnn/main.py b/applications/nlp/rnn/main.py new file mode 100644 index 00000000000..3aa00463919 --- /dev/null +++ b/applications/nlp/rnn/main.py @@ -0,0 +1,118 @@ +"""Simple recurrent network on tokenized text data.""" +import argparse +import os.path +import sys + +import lbann +import lbann.modules +import lbann.contrib.launcher +import lbann.contrib.args + +# Local imports +current_dir = os.path.dirname(os.path.realpath(__file__)) +root_dir = os.path.dirname(current_dir) +sys.path.append(root_dir) +import dataset +from utils import str_list + +# ---------------------------------- +# Options +# ---------------------------------- + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_textrnn', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=20, type=int, + help='number of epochs (default: 20)', metavar='NUM') +parser.add_argument( + '--latent-dim', action='store', default=128, type=int, + help='latent space dimensions (default: 128)', metavar='NUM') +args = parser.parse_args() + +# ---------------------------------- +# Construct layer graph +# ---------------------------------- + +# Dataset properties +vocab_size = dataset.corpus.vocab_size +sequence_length = dataset.sample_dims()[0] + +# Input is a sequence of token IDs +input_ = lbann.Identity(lbann.Input()) +input_slice = lbann.Slice(input_, + slice_points=str_list(range(sequence_length+1))) +tokens_list = [lbann.Identity(input_slice) for _ in range(sequence_length)] + +# Get sequence of embedding vectors +embeddings = lbann.Embedding(input_, + num_embeddings=vocab_size, + embedding_dim=args.latent_dim) +embeddings_slice = lbann.Slice(embeddings, + axis=0, + slice_points=str_list(range(sequence_length+1))) +embeddings_list = [lbann.Reshape(embeddings_slice, dims='-1') + for _ in range(sequence_length)] + +# Layer 
modules +lstm = lbann.modules.LSTMCell(args.latent_dim) +lstm_state = [lbann.Constant(value=0, num_neurons=str_list(args.latent_dim)), + lbann.Constant(value=0, num_neurons=str_list(args.latent_dim))] +pred_fc = lbann.modules.FullyConnectedModule(vocab_size, + data_layout='model_parallel') + +# Iterate through RNN steps +loss = [] +for step in range(sequence_length-1): + + # Predict next token with RNN + x = embeddings_list[step] + x, lstm_state = lstm(x, lstm_state) + x = pred_fc(x) + pred = lbann.Softmax(x) + + # Evaluate prediction with cross entropy + ground_truth = lbann.OneHot(tokens_list[step+1], size=vocab_size) + cross_entropy = lbann.CrossEntropy([pred, ground_truth]) + loss.append(lbann.LayerTerm(cross_entropy, scale=1/(sequence_length-1))) + +# ---------------------------------- +# Create data reader +# ---------------------------------- + +reader = lbann.reader_pb2.DataReader() +_reader = reader.reader.add() +_reader.name = 'python' +_reader.role = 'train' +_reader.shuffle = True +_reader.percent_of_data_to_use = 1.0 +_reader.python.module = 'dataset' +_reader.python.module_dir = current_dir +_reader.python.sample_function = 'get_sample' +_reader.python.num_samples_function = 'num_samples' +_reader.python.sample_dims_function = 'sample_dims' + +# ---------------------------------- +# Run LBANN +# ---------------------------------- + +# Create LBANN objects +trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) +model = lbann.Model(args.num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + callbacks=[lbann.CallbackPrint(), + lbann.CallbackTimer()]) +opt = lbann.SGD(learn_rate=0.01, momentum=0.9) + +# Run LBANN +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.launcher.run(trainer, model, reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/applications/nlp/transformer/dataset.py b/applications/nlp/transformer/dataset.py new file mode 100644 index 00000000000..4780a035654 --- /dev/null +++ b/applications/nlp/transformer/dataset.py @@ -0,0 +1,129 @@ +"""WMT 2014 dataset for English-German translation.""" +import os.path +import sys + +import numpy as np +import torchnlp.datasets + +# Local imports +current_file = os.path.realpath(__file__) +root_dir = os.path.dirname(os.path.dirname(current_file)) +sys.path.append(root_dir) +import utils.paths + +# ---------------------------------------------- +# Options +# ---------------------------------------------- + +# Note: Sequence lengths for WMT 2014 have mean 29.05, standard +# deviation 16.20, and max 484. +sequence_length = 64 + +# ---------------------------------------------- +# Setup +# ---------------------------------------------- + +# Load WMT 2014 dataset +data_dir = utils.paths.wmt_dir() +dataset_train, dataset_val = torchnlp.datasets.wmt_dataset( + directory=data_dir, + train=True, + dev=True, +) + +# Load token vocabulary +with open(os.path.join(data_dir, 'vocab.bpe.32000')) as f: + tokens = f.read().splitlines() +tokens.extend(['', '', '', '']) +token_indices = dict(zip(tokens, range(len(tokens)))) +unk_index = token_indices.get('', -1) +bos_index = token_indices.get('', -1) +eos_index = token_indices.get('', -1) +pad_index = token_indices.get('', -1) + +# ---------------------------------------------- +# Tokenization +# ---------------------------------------------- + +def tokenize(text): + """Convert string to list of token indices. + + WMT 2014 has already been tokenized with byte-pair encoding. We + add BOS and EOS tokens. 
+ + """ + indices = [bos_index] + indices.extend( + token_indices.get(token, unk_index) + for token in text.split(' ') + ) + indices.append(eos_index) + return indices + +def detokenize(indices): + """Convert token indices to string. + + Stops at the first EOS token. All other special tokens are + ignored. + + """ + text = '' + for index in indices: + if index == eos_index: + break + elif index in (unk_index, bos_index, pad_index): + continue + else: + text += f' {tokens[index]}' + return text + +# ---------------------------------------------- +# Sample access functions +# ---------------------------------------------- + +def get_train_sample(index): + """Token indices for a data sample from the training set. + + The English and German text samples are tokenized, + padded/subsampled to sequence_length tokens, and concatenated. + + """ + + # Tokenize text data + text = dataset_train[index] + sample_en = tokenize(text['en']) + sample_de = tokenize(text['de']) + + # Randomly subsample sequences if they are too long + if len(sample_en) > sequence_length or len(sample_de) > sequence_length: + pos = np.random.rand() + if len(sample_en) > sequence_length: + offset = (len(sample_en) - sequence_length + 1) * pos + offset = int(np.floor(offset)) + sample_en = sample_en[offset:offset+sequence_length] + if len(sample_de) > sequence_length: + offset = (len(sample_de) - sequence_length + 1) * pos + offset = int(np.floor(offset)) + sample_de = sample_de[offset:offset+sequence_length] + + # Concatenate sequences and return + sample = np.full(2*sequence_length, pad_index, dtype=int) + sample[0:len(sample_en)] = sample_en + sample[sequence_length:sequence_length+len(sample_de)] = sample_de + return sample + +def get_val_sample(index): + """Token indices for a data sample from the validation set.""" + text = dataset_val[index] + sample_en = tokenize(text['en']) + sample_de = tokenize(text['de']) + return sample_en, sample_de + +def num_train_samples(): + return len(dataset_train) +def num_val_samples(): + return len(dataset_val) +def sample_dims(): + return (2*sequence_length+1,) +def vocab_size(): + return len(tokens) diff --git a/applications/nlp/transformer/evaluate.py b/applications/nlp/transformer/evaluate.py new file mode 100644 index 00000000000..91ec00fad95 --- /dev/null +++ b/applications/nlp/transformer/evaluate.py @@ -0,0 +1,322 @@ +"""Evaluate Transformer example. + +The LBANN Transformer model is assumed to have saved its weights to +weight files with the "dump weights" callback. These weights are +loaded into a PyTorch model and English-German translation is +performed with greedy decoding on the WMT 2014 validation dataset. +BLEU scores are computed for the predicted translations. + +""" + +import argparse +import os.path +import sys + +import numpy as np +import torch +import torch.nn +import torchnlp.metrics + +# Local imports +current_file = os.path.realpath(__file__) +root_dir = os.path.dirname(os.path.dirname(current_file)) +sys.path.append(root_dir) +import dataset +import utils +import utils.paths + +# ---------------------------------------------- +# Options +# ---------------------------------------------- + +# Evaluation options +mini_batch_size = 64 # Doesn't need to match training + +# Hard-coded model parameters +# Note: Must match parameters from training. 
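+# Note: These defaults appear to be the base configuration from Vaswani et
+# al., "Attention Is All You Need" (2017): 512-d embeddings, 8 heads, 6
+# encoder/decoder layers, 2048-d feedforward, dropout 0.1.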
+embed_dim = 512 +num_heads = 8 +num_encoder_layers = 6 +num_decoder_layers = 6 +filter_dim = 2048 +dropout = 0.1 + +# Dataset properties +vocab_size = dataset.vocab_size() +max_sequence_length = dataset.sequence_length +bos_index = dataset.bos_index +eos_index = dataset.eos_index +pad_index = dataset.pad_index +num_samples = dataset.num_val_samples() + +# ---------------------------------------------- +# Evaluation data +# ---------------------------------------------- + +def get_batch(indices): + """Get a batch of samples from the evaluation dataset. + + The sequences are padded to the length of the longest sequence in + the batch. + + """ + + # Get data samples + indices = utils.make_iterable(indices) + tokens_list_en = [] + tokens_list_de = [] + for index in indices: + tokens_en, tokens_de = dataset.get_val_sample(index) + tokens_list_en.append(tokens_en) + tokens_list_de.append(tokens_de) + + # Convert tokens to PyTorch tensors + tokens_en = np.full( + (max(len(seq) for seq in tokens_list_en), len(indices)), + pad_index, + dtype=int, + ) + tokens_de = np.full( + (max(len(seq) for seq in tokens_list_de), len(indices)), + pad_index, + dtype=int, + ) + for i, seq in enumerate(tokens_list_en): + tokens_en[:len(seq), i] = seq + for i, seq in enumerate(tokens_list_de): + tokens_de[:len(seq), i] = seq + tokens_en = torch.from_numpy(tokens_en) + tokens_de = torch.from_numpy(tokens_de) + return tokens_en, tokens_de + +# ---------------------------------------------- +# Load model from file +# ---------------------------------------------- + +def load_parameter(weight_file): + """Create a PyTorch Parameter object with weights from LBANN. + + Weight file is assumed to have been created by the "dump weights" + callback in LBANN. + + """ + data = np.loadtxt(weight_file, dtype=np.float32) + return torch.nn.Parameter( + data=torch.from_numpy(data), + requires_grad=False + ) + +def load_embedding_layer(weights_prefix): + """Create a PyTorch embedding layer with weights from LBANN. + + Weight files are assumed to have been created by the "dump + weights" callback in LBANN. They should be in the form + -embeddings-Weights.txt. + + """ + weight_file = f'{weights_prefix}-embeddings-Weights.txt' + weight = load_parameter(weight_file).transpose(1,0) + return torch.nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=pad_index, + _weight=weight, + ) + +def load_transformer(weights_prefix): + """Create a PyTorch transformer with weights from LBANN. + + Weight files are assumed to have been created by the "dump + weights" callback in LBANN. They should be in the form + --Weights.txt. 
+ + """ + + # PyTorch transformer model + transformer = torch.nn.Transformer( + d_model=embed_dim, + nhead=num_heads, + num_encoder_layers=num_encoder_layers, + num_decoder_layers=num_decoder_layers, + dim_feedforward=filter_dim, + dropout=dropout, + ) + + # Set transformer to evaluation mode + transformer.eval() + + # Load weights for encoder + for i, layer in enumerate(transformer.encoder.layers): + + # Load weights for self-attention + attention = layer.self_attn + attention._qkv_same_embed_dim = False + prefix = f'{weights_prefix}-transformer_encoder{i}_attention' + attention.q_proj_weight = load_parameter(f'{prefix}_query_matrix-Weights.txt') + attention.q_proj_bias = load_parameter(f'{prefix}_query_bias-Weights.txt') + attention.k_proj_weight = load_parameter(f'{prefix}_key_matrix-Weights.txt') + attention.k_proj_bias = load_parameter(f'{prefix}_key_bias-Weights.txt') + attention.v_proj_weight = load_parameter(f'{prefix}_value_matrix-Weights.txt') + attention.v_proj_bias = load_parameter(f'{prefix}_value_bias-Weights.txt') + attention.out_proj_weight = load_parameter(f'{prefix}_output_matrix-Weights.txt') + attention.out_proj_bias = load_parameter(f'{prefix}_output_bias-Weights.txt') + + # Load weights for feedforward network + prefix = f'{weights_prefix}-transformer_encoder{i}' + layer.linear1.weight = load_parameter(f'{prefix}_fc1_matrix-Weights.txt') + layer.linear1.bias = load_parameter(f'{prefix}_fc1_bias-Weights.txt') + layer.linear2.weight = load_parameter(f'{prefix}_fc2_matrix-Weights.txt') + layer.linear2.bias = load_parameter(f'{prefix}_fc2_bias-Weights.txt') + + # Load weights for decoder + for i, layer in enumerate(transformer.decoder.layers): + + # Load weights for self-attention + attention = layer.self_attn + attention._qkv_same_embed_dim = False + prefix = f'{weights_prefix}-transformer_decoder{i}_attention1' + attention.q_proj_weight = load_parameter(f'{prefix}_query_matrix-Weights.txt') + attention.q_proj_bias = load_parameter(f'{prefix}_query_bias-Weights.txt') + attention.k_proj_weight = load_parameter(f'{prefix}_key_matrix-Weights.txt') + attention.k_proj_bias = load_parameter(f'{prefix}_key_bias-Weights.txt') + attention.v_proj_weight = load_parameter(f'{prefix}_value_matrix-Weights.txt') + attention.v_proj_bias = load_parameter(f'{prefix}_value_bias-Weights.txt') + attention.out_proj_weight = load_parameter(f'{prefix}_output_matrix-Weights.txt') + attention.out_proj_bias = load_parameter(f'{prefix}_output_bias-Weights.txt') + + # Load weights for attention with memory + attention = layer.multihead_attn + attention._qkv_same_embed_dim = False + prefix = f'{weights_prefix}-transformer_decoder{i}_attention2' + attention.q_proj_weight = load_parameter(f'{prefix}_query_matrix-Weights.txt') + attention.q_proj_bias = load_parameter(f'{prefix}_query_bias-Weights.txt') + attention.k_proj_weight = load_parameter(f'{prefix}_key_matrix-Weights.txt') + attention.k_proj_bias = load_parameter(f'{prefix}_key_bias-Weights.txt') + attention.v_proj_weight = load_parameter(f'{prefix}_value_matrix-Weights.txt') + attention.v_proj_bias = load_parameter(f'{prefix}_value_bias-Weights.txt') + attention.out_proj_weight = load_parameter(f'{prefix}_output_matrix-Weights.txt') + attention.out_proj_bias = load_parameter(f'{prefix}_output_bias-Weights.txt') + + # Load weights for feedforward network + prefix = f'{weights_prefix}-transformer_decoder{i}' + layer.linear1.weight = load_parameter(f'{prefix}_fc1_matrix-Weights.txt') + layer.linear1.bias = 
load_parameter(f'{prefix}_fc1_bias-Weights.txt') + layer.linear2.weight = load_parameter(f'{prefix}_fc2_matrix-Weights.txt') + layer.linear2.bias = load_parameter(f'{prefix}_fc2_bias-Weights.txt') + + return transformer + +# ---------------------------------------------- +# Evaluate transformer model +# ---------------------------------------------- + +def add_positional_encoding(x): + """Add positional encoding for transformer model.""" + sequence_length = x.shape[0] + embed_dim = x.shape[2] + encoding = np.zeros(x.shape, dtype=np.float32) + for i in range((embed_dim+1) // 2): + pos = np.arange(sequence_length).reshape(-1,1) + encoding[:,:,2*i] = np.sin(pos / 10000**(2*i/embed_dim)) + for i in range(embed_dim // 2): + pos = np.arange(sequence_length).reshape(-1,1) + encoding[:,:,2*i+1] = np.cos(pos / 10000**(2*i/embed_dim)) + return x + torch.from_numpy(encoding) + +def greedy_decode(tokens_en, embedding_layer, transformer, classifier): + """Generate sequence with transformer. + + Predict tokens one at a time by choosing the one that maximizes + the classification score. + + """ + + # Encode English sequence + embeddings_en = embedding_layer(tokens_en) + memory = transformer.encoder( + add_positional_encoding(embeddings_en * np.sqrt(embed_dim)) + ) + + # Decode German sequence + # TODO: Only perform compute for last sequence entry + # TODO: Detect EOS tokens and stop early + tokens_de = torch.full((1,tokens_en.shape[1]), bos_index, dtype=int) + for i in range(1, max_sequence_length): + embeddings_de = embedding_layer(tokens_de) + preds = transformer.decoder( + add_positional_encoding(embeddings_de * np.sqrt(embed_dim)), + memory, + tgt_mask=transformer.generate_square_subsequent_mask(i), + ) + preds = classifier(preds[-1,:,:]) + preds = preds.argmax(dim=1) + tokens_de = torch.cat([tokens_de, preds.reshape(1,-1)], dim=0) + return tokens_de + +def evaluate_transformer(weights_prefix): + """Evaluate transformer model with weights from LBANN. + + Weight files are assumed to have been created by the "dump + weights" callback in LBANN. They should be in the form + --Weights.txt. 
+ + """ + + # Load model weights from file + embedding_layer = load_embedding_layer(weights_prefix) + transformer = load_transformer(weights_prefix) + classifier = torch.nn.Linear(embed_dim, vocab_size, bias=False) + classifier.weight = embedding_layer.weight + + # Evaluate model + bleu_scores = [] + for batch, index_start in enumerate(range(0, num_samples, mini_batch_size)): + index_end = min(index_start+mini_batch_size, num_samples) + indices = list(range(index_start, index_end)) + batch_size = len(indices) + + # Translate English sequence to German + # TODO: Decoding with beam search + tokens_en, true_tokens_de = get_batch(indices) + pred_tokens_de = greedy_decode( + tokens_en, + embedding_layer, + transformer, + classifier, + ) + + # Compute BLEU score + for i in range(batch_size): + hypothesis = dataset.detokenize(pred_tokens_de[:,i].numpy()) + reference = dataset.detokenize(true_tokens_de[:,i].numpy()) + bleu_scores.append( + torchnlp.metrics.get_moses_multi_bleu( + [hypothesis], + [reference], + ) + ) + + # Print results + print( + f'BLEU score: ' + f'mean={np.mean(bleu_scores)}, ' + f'stdev={np.std(bleu_scores)}, ' + f'min={np.min(bleu_scores)}, ' + f'max={np.max(bleu_scores)}' + ) + +# ---------------------------------------------- +# Command-line options if run as script +# ---------------------------------------------- + +if __name__ == "__main__": + + # Command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument( + 'weights_prefix', type=str, + help='prefix for saved weights from LBANN') + args = parser.parse_args() + + # Evaluate model + evaluate_transformer(args.weights_prefix) diff --git a/applications/nlp/transformer/main.py b/applications/nlp/transformer/main.py new file mode 100644 index 00000000000..179fb59afa3 --- /dev/null +++ b/applications/nlp/transformer/main.py @@ -0,0 +1,94 @@ +"""Driver script for training Transformer example.""" +import argparse +import datetime +import math +import os +import os.path +import sys + +import lbann +import lbann.contrib.args + +# Local imports +current_dir = os.path.dirname(os.path.realpath(__file__)) +root_dir = os.path.dirname(current_dir) +sys.path.append(root_dir) +import train +import evaluate +import utils.paths + +# ---------------------------------------------- +# Options +# ---------------------------------------------- + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_transformer', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=20, type=int, + help='number of epochs (default: 20)', metavar='NUM') +parser.add_argument( + '--num-attention-heads', action='store', default=8, type=int, + help='number of parallel attention layers (default: 8)', metavar='NUM') +parser.add_argument( + '--embed-dim', action='store', default=512, type=int, + help='embedding space dimensions (default: 512)', metavar='NUM') +args = parser.parse_args() + +# Hard-coded options +label_smoothing = 0.1 + +# ---------------------------------------------- +# Work directory +# ---------------------------------------------- + +timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') +work_dir = os.path.join( + utils.paths.root_dir(), + 'experiments', + f'{timestamp}_{args.job_name}', +) 
+os.makedirs(work_dir, exist_ok=True) + +# ---------------------------------------------- +# Train +# ---------------------------------------------- + +# Create batch script +trainer_params = { + 'mini_batch_size': args.mini_batch_size, +} +model_params = { + 'num_epochs': args.num_epochs, + 'embed_dim': args.embed_dim, + 'num_heads': args.num_attention_heads, + 'label_smoothing': label_smoothing, +} +script_params = lbann.contrib.args.get_scheduler_kwargs(args) +script_params['work_dir'] = work_dir +script_params['job_name'] = args.job_name +train_script = train.make_batch_script( + trainer_params=trainer_params, + model_params=model_params, + script_params=script_params, +) +weights_prefix = os.path.join( + work_dir, + 'weights', + f'model0-epoch{args.num_epochs-1}', +) +train_script.add_command( + f'# python3 {utils.paths.root_dir()}/transformer/evaluate.py {weights_prefix}' +) +train_script.run(overwrite=True) + +# ---------------------------------------------- +# Evaluate +# ---------------------------------------------- +evaluate.evaluate_transformer(weights_prefix) diff --git a/applications/nlp/transformer/train.py b/applications/nlp/transformer/train.py new file mode 100644 index 00000000000..45c2eb3e237 --- /dev/null +++ b/applications/nlp/transformer/train.py @@ -0,0 +1,230 @@ +"""Configure LBANN experiment with Transformer model.""" +import math +import os.path + +import lbann +import lbann.models +import lbann.contrib.launcher +from lbann.util import str_list + +import dataset + +# ---------------------------------------------- +# Options +# ---------------------------------------------- + +# Dataset properties +vocab_size = dataset.vocab_size() +sequence_length = dataset.sequence_length +pad_index = dataset.pad_index + +# ---------------------------------------------- +# Model +# ---------------------------------------------- + +def make_model( + num_epochs, + embed_dim, + num_heads, + label_smoothing, +): + + # Embedding weights + var = 2 / (embed_dim + vocab_size) # Glorot initialization + embedding_weights = lbann.Weights( + name='embeddings', + initializer=lbann.NormalInitializer(standard_deviation=math.sqrt(var)), + ) + + # Input is two sequences of token IDs + input_ = lbann.Identity(lbann.Input()) + + # Get sequences of embedding vectors + # Note: Scale embeddings by sqrt(embed_dim). + # Note: Decoder input is shifted right, so embedding for last + # token isn't needed. 
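+    # For example, with the default sequence_length=64 each sample holds 128
+    # token IDs: positions [0, 64) are the English (encoder) sequence and
+    # [64, 128) are the German (decoder) sequence, so slicing at
+    # 2*sequence_length-1 below keeps everything except the final decoder token.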
+ embeddings_tokens = lbann.Identity(lbann.Slice( + input_, + axis=0, + slice_points=str_list([0, 2*sequence_length-1]), + )) + embeddings = lbann.Embedding( + embeddings_tokens, + weights=embedding_weights, + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=pad_index, + ) + embeddings = lbann.WeightedSum( + embeddings, + scaling_factors=str(math.sqrt(embed_dim)), + ) + embeddings_slice = lbann.Slice( + embeddings, + axis=0, + slice_points=str_list([0, sequence_length, 2*sequence_length-1]), + ) + encoder_input = lbann.Identity(embeddings_slice) + decoder_input = lbann.Identity(embeddings_slice) + + # Apply transformer model + transformer = lbann.models.Transformer( + hidden_size=embed_dim, + num_heads=num_heads, + name='transformer', + ) + result = transformer( + encoder_input, sequence_length, + decoder_input, sequence_length-1, + ) + + # Reconstruct decoder input + preds = lbann.ChannelwiseFullyConnected( + result, + weights=embedding_weights, + output_channel_dims=[vocab_size], + bias=False, + transpose=True, + ) + preds = lbann.ChannelwiseSoftmax(preds) + preds = lbann.Slice(preds, axis=0, slice_points=str_list(range(sequence_length))) + preds = [lbann.Identity(preds) for _ in range(sequence_length-1)] + + # Count number of non-pad tokens + label_tokens = lbann.Identity(lbann.Slice( + input_, + slice_points=str_list([sequence_length+1, 2*sequence_length]), + )) + pads = lbann.Constant(value=pad_index, num_neurons=str(sequence_length-1)) + is_not_pad = lbann.NotEqual(label_tokens, pads) + num_not_pad = lbann.Reduction(is_not_pad, mode='sum') + + # Cross entropy loss with label smoothing + label_tokens = lbann.Slice( + label_tokens, + slice_points=str_list(range(sequence_length)), + ) + label_tokens = [lbann.Identity(label_tokens) for _ in range(sequence_length-1)] + if label_smoothing > 0: + uniform_label = lbann.Constant( + value=1/vocab_size, + num_neurons=str_list([1, vocab_size]) + ) + loss = [] + for i in range(sequence_length-1): + label = lbann.OneHot(label_tokens[i], size=vocab_size) + label = lbann.Reshape(label, dims=str_list([1, vocab_size])) + if label_smoothing > 0: + label = lbann.WeightedSum( + label, + uniform_label, + scaling_factors=str_list([1-label_smoothing, label_smoothing]), + ) + loss.append(lbann.CrossEntropy(preds[i], label)) + loss = lbann.Concatenation(loss) + + # Average cross entropy over non-pad tokens + loss_scales = lbann.Divide( + is_not_pad, + lbann.Tessellate(num_not_pad, hint_layer=is_not_pad), + ) + loss = lbann.Multiply(loss, loss_scales) + loss = lbann.Reduction(loss, mode='sum') + + # Construct model + metrics = [] + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + return lbann.Model( + num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=metrics, + callbacks=callbacks, + ) + +# ---------------------------------------------- +# Data reader +# ---------------------------------------------- + +def make_data_reader(): + reader = lbann.reader_pb2.DataReader() + _reader = reader.reader.add() + _reader.name = 'python' + _reader.role = 'train' + _reader.shuffle = True + _reader.percent_of_data_to_use = 1.0 + _reader.python.module = 'dataset' + _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + _reader.python.sample_function = 'get_train_sample' + _reader.python.num_samples_function = 'num_train_samples' + _reader.python.sample_dims_function = 'sample_dims' + return reader + +# ---------------------------------------------- +# Batch script +# 
---------------------------------------------- + +def make_batch_script(trainer_params, model_params, script_params): + + # Create LBANN objects + trainer = lbann.Trainer(mini_batch_size=trainer_params.mini_batch_size) + model = make_model(**model_params) + reader = make_data_reader() + + # Optimizer with learning rate schedule + # Note: Rough approximation of + # embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5) + # with embed_dim=512 and warmup=4000. + opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9) + model.callbacks.append( + lbann.CallbackDropFixedLearningRate( + drop_epoch=[1], + amt=2, + ) + ) + model.callbacks.append( + lbann.CallbackDropFixedLearningRate( + drop_epoch=[2,4,8,12], + amt=0.75, + ) + ) + + # Checkpoint after every epoch + trainer.callbacks.append( + lbann.CallbackCheckpoint( + checkpoint_dir=os.path.join(script_params['work_dir'], 'checkpoint'), + checkpoint_epochs=1, + ) + ) + + # Dump weights after every epoch + model.callbacks.append( + lbann.CallbackDumpWeights( + basename=os.path.join(script_params['work_dir'], 'weights'), + epoch_interval=1, + ) + ) + + # Create Protobuf file + protobuf_file = os.path.join(script_params['work_dir'], 'experiment.prototext') + lbann.proto.save_prototext( + protobuf_file, + trainer=trainer, + model=model, + data_reader=reader, + optimizer=opt, + ) + + # Create batch script + script = lbann.contrib.launcher.make_batch_script( + **script_params, + ) + script.add_command('echo "Started training at $(date)"') + script.add_parallel_command([ + lbann.lbann_exe(), + f'--prototext={protobuf_file}', + ]) + script.add_command('status=$?') + script.add_command('echo "Finished training at $(date)"') + script.add_command('exit ${status}') + return script diff --git a/applications/nlp/utils/__init__.py b/applications/nlp/utils/__init__.py new file mode 100644 index 00000000000..29d6d9d3e7f --- /dev/null +++ b/applications/nlp/utils/__init__.py @@ -0,0 +1,17 @@ +import collections.abc + +def make_iterable(obj): + """Convert to an iterable object. + + Simply returns `obj` if it is alredy iterable. Otherwise returns a + 1-tuple containing `obj`. + + """ + if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + return obj + else: + return (obj,) + +def str_list(it): + """Convert an iterable object to a space-separated string.""" + return ' '.join([str(i) for i in make_iterable(it)]) diff --git a/applications/nlp/utils/gutenberg.py b/applications/nlp/utils/gutenberg.py new file mode 100644 index 00000000000..42d5d6da16d --- /dev/null +++ b/applications/nlp/utils/gutenberg.py @@ -0,0 +1,146 @@ +"""Helper functions for text data from Project Gutenberg.""" +import array +import os +import os.path +import re +import urllib.request +import numpy as np + + +def get_url(name): + """URL to Project Gutenberg text file.""" + urls = { + 'frankenstein': 'https://www.gutenberg.org/files/84/84-0.txt', + 'shakespeare': 'https://www.gutenberg.org/files/100/100-0.txt', + } + return urls[name.lower()] + + +def strip_boilerplate(raw_file, stripped_file): + """Remove header and footer from Project Gutenberg text file. + + See: + + https://www.gutenberg.org/wiki/Gutenberg:Project_Gutenberg_Header_How-To + + Args: + raw_file (str): Text file downloaded from Project Gutenberg. + stripped_file (str): Path where the stripped file will be + saved. 
+ + """ + with open(raw_file, 'r') as in_file, \ + open(stripped_file, 'w') as out_file: + started = False + begin_regex = re.compile('^\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*$') + end_regex = re.compile('^\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*$') + for line in in_file: + if started: + if end_regex.match(line): + break + else: + out_file.write(line) + elif begin_regex.match(line): + started = True + + +def tokenize(text_file, + encoded_file=None, + vocab_file=None, + ignore_whitespace=True): + """Convert text file to sequence of token IDs. + + Tokenization is performed with BERT tokenizer. + + Args: + text_file (str): Text file to be encoded. + encoded_file (str, optional): If provided, path where the + encoded data will be saved as an .npz file. The sequence of + token IDs is saved as 'encoded_data' and the vocabulary + size is saved as 'vocab_size'. + vocab_file (str, optional): If provided, path where the + vocabulary will be saved as a text file. + ignore_whitespace (bool, optional): Whether to ignore text + lines that are purely made of whitespace (default: True). + + Returns: + array of int: Sequence of token IDs. + int: Number of tokens in vocabulary. + + """ + + # Get BERT tokenizer from Transformers + import transformers + tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased') + vocab_size = tokenizer.vocab_size + if vocab_file: + tokenizer.save_vocabulary(vocab_file) + + # Apply tokenizer to text file + encoded_data = array.array('l') + with open(text_file) as f: + for line in f: + if ignore_whitespace and line.isspace(): + continue + encoded_data.extend(tokenizer.encode(line)) + if encoded_file: + np.savez_compressed(encoded_file, + encoded_data=encoded_data, + vocab_size=vocab_size) + return encoded_data, vocab_size + + +class GutenbergCorpus(): + """Tokenized text from Project Gutenberg. + + Args: + data_dir (str): Directory for downloading data and + intermediate. + data_url (str): URL to Project Gutenberg text file. + + Attributes: + token_data (array of int): Sequence of token IDs. + vocab_size (int): Number of tokens in vocabulary. + + """ + def __init__(self, data_dir, data_url): + + # Create data directory if needed + if not os.path.isdir(data_dir): + os.makedirs(data_dir) + data_dir = os.path.realpath(data_dir) + + # Load tokenized data + # Note: If needed, download the text data from Project + # Gutenberg and tokenize it. 
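+        # Files cached in data_dir: raw.txt (downloaded text), text_data.txt
+        # (boilerplate stripped), token_data.npz (token IDs and vocab size),
+        # and vocab.txt (tokenizer vocabulary).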
+        token_data_file = os.path.join(data_dir, 'token_data.npz')
+        if os.path.isfile(token_data_file):
+            data = np.load(token_data_file)
+            token_data = data['encoded_data']
+            vocab_size = int(data['vocab_size'])
+        else:
+            text_data_file = os.path.join(data_dir, 'text_data.txt')
+            if not os.path.isfile(text_data_file):
+                raw_file = os.path.join(data_dir, 'raw.txt')
+                if not os.path.isfile(raw_file):
+                    urllib.request.urlretrieve(data_url,
+                                               filename=raw_file)
+                strip_boilerplate(raw_file, text_data_file)
+            vocab_file = os.path.join(data_dir, 'vocab.txt')
+            token_data, vocab_size = tokenize(text_data_file,
+                                              token_data_file,
+                                              vocab_file)
+
+        # Class members
+        self.token_data = token_data
+        self.vocab_size = vocab_size
+
+    def __iter__(self):
+        """Iterator through token IDs."""
+        return self.token_data.__iter__()
+    def __getitem__(self, key):
+        """Get token ID."""
+        return self.token_data.__getitem__(key)
+    def __len__(self):
+        """Get total number of tokens in corpus."""
+        return self.token_data.__len__()
diff --git a/applications/nlp/utils/paths.py b/applications/nlp/utils/paths.py
new file mode 100644
index 00000000000..70d968230c5
--- /dev/null
+++ b/applications/nlp/utils/paths.py
@@ -0,0 +1,44 @@
+"""Useful file paths."""
+import os
+import os.path
+import re
+import socket
+
+
+def system():
+    """Name of current compute system.
+
+    Primarily used to detect LLNL LC systems.
+
+    """
+    return re.sub(r'\d+', '', socket.gethostname())
+
+
+def root_dir():
+    """Root directory for LBANN NLP application."""
+    return os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+
+
+def wmt_dir(system=system()):
+    """Data directory for the WMT 2014 dataset.
+
+    See https://pytorchnlp.readthedocs.io/en/latest/source/torchnlp.datasets.html#torchnlp.datasets.wmt_dataset.
+
+    The dataset has already been downloaded on LLNL LC systems and is
+    available to anyone in the "brainusr" group. If the dataset is not
+    accessible, a path within the application directory is returned.
+
+    """
+
+    # Cached datasets on LC systems
+    path = None
+    if system in ('lassen', 'sierra'):
+        path = '/p/gpfs1/brainusr/datasets/wmt16_en_de'
+    elif system in ('pascal', 'catalyst', 'quartz', 'surface'):
+        path = '/p/lscratchh/brainusr/datasets/wmt16_en_de'
+
+    # Default path if cached dataset isn't available
+    if not path or not os.access(path, os.R_OK):
+        path = os.path.join(root_dir(), 'data', 'wmt16_en_de')
+
+    return path
diff --git a/applications/physics/ICF/README.md b/applications/physics/ICF/README.md
new file mode 100644
index 00000000000..7f134908581
--- /dev/null
+++ b/applications/physics/ICF/README.md
@@ -0,0 +1,93 @@
+## Surrogate Models for Inertial Confinement Fusion
+
+The scripts in this directory implement surrogate deep learning models that bridge simulation and experimental datasets from an inertial confinement fusion (high energy physics) application. Please see [link](https://github.com/rushilanirudh/macc) for more details on the model architectures and the dataset. Also, see the LBANN documentation for how to install, build, and run LBANN.
+ +### Pre-train Wasserstein autoencoder (WAE) +```bash +python3 pre_train_jag_wae.py +``` +### Train surrogate model using pre-trained WAE +```bash +python3 train_macc_surrogate.py +``` +Expected output of pre-training WAE in LBANN (90K training, 10K validation dataset, on a single LLNL Pascal GPU) is shown: +``` +-------------------------------------------------------------------------------- + +[0] Epoch : stats formated [tr/v/te] iter/epoch = [719/80/79] + + global MB = [ 128/ 128/ 128] global last MB = [ 33 / 103 / 16 ] + + local MB = [ 128/ 128/ 128] local last MB = [ 33+0/ 103+0/ 16+0] + +-------------------------------------------------------------------------------- + +model0 (instance 0) training epoch 0 objective function : 940.059 + +model0 (instance 0) training epoch 0 recon_error : 0.0572849 + +model0 (instance 0) training epoch 0 run time : 5.74964s + +model0 (instance 0) validation objective function : 34.3421 + +model0 (instance 0) validation recon_error : 0.00208194 + +model0 (instance 0) validation run time : 0.494716s + +-------------------------------------------------------------------------------- + +[1] Epoch : stats formated [tr/v/te] iter/epoch = [719/80/79] + + global MB = [ 128/ 128/ 128] global last MB = [ 33 / 103 / 16 ] + + local MB = [ 128/ 128/ 128] local last MB = [ 33+0/ 103+0/ 16+0] + +-------------------------------------------------------------------------------- + +model0 (instance 0) training epoch 1 objective function : 22.2183 + +model0 (instance 0) training epoch 1 recon_error : 0.00134448 + +model0 (instance 0) training epoch 1 run time : 5.53825s + +model0 (instance 0) validation objective function : 11.6158 + +model0 (instance 0) validation recon_error : 0.000693222 + +model0 (instance 0) validation run time : 0.317699s + +-------------------------------------------------------------------------------- + +[2] Epoch : stats formated [tr/v/te] iter/epoch = [719/80/79] + + global MB = [ 128/ 128/ 128] global last MB = [ 33 / 103 / 16 ] + + local MB = [ 128/ 128/ 128] local last MB = [ 33+0/ 103+0/ 16+0] + +-------------------------------------------------------------------------------- + +model0 (instance 0) training epoch 2 objective function : 9.18846 + +model0 (instance 0) training epoch 2 recon_error : 0.000554316 + +model0 (instance 0) training epoch 2 run time : 5.69306s + +model0 (instance 0) validation objective function : 6.96061 + +model0 (instance 0) validation recon_error : 0.00039013 + +model0 (instance 0) validation run time : 0.315543s + +``` + +### Visual Outputs +##### Ground Truth Images + +![alt text](lbann_gt_img.png) + +##### Predicted Images +![alt text](lbann_pred_img.png) + +##### Ground Truth and Predicted Scalars +![alt text](lbann_gt_pred_sca.png) + diff --git a/applications/physics/ICF/check_all_scalar.py b/applications/physics/ICF/check_all_scalar.py new file mode 100644 index 00000000000..cf1c068a5e2 --- /dev/null +++ b/applications/physics/ICF/check_all_scalar.py @@ -0,0 +1,25 @@ +import numpy as np +import sys +import glob + +#Check if there are scalars with all zero values +#Input is scalar values dumped from LBANN input layer +fdir = sys.argv[1] +epoch = sys.argv[2] +print(fdir) +scalar_files = glob.glob(fdir+"*training-epoch"+str(epoch)+"*gt_sca*.npy") +scalar_jag = np.load(scalar_files[0]) +print("First JAG param shape " , scalar_jag.shape) +print("param jag ", scalar_jag) +for i, f in enumerate(scalar_files): + if(i > 0) : + scalar_jag = np.concatenate((scalar_jag, np.load(f))) + +print("Final JAG param shape " 
, scalar_jag.shape) + + +num_cols = scalar_jag.shape[1] +print("Num cols ", num_cols) + +zeros = np.where(np.all(np.isclose(scalar_jag, 0), axis=1)) +print("Num of zerors ", zeros[0].shape , " ", zeros) diff --git a/applications/physics/ICF/eval_macc_surrogate.py b/applications/physics/ICF/eval_macc_surrogate.py new file mode 100644 index 00000000000..53f6665c687 --- /dev/null +++ b/applications/physics/ICF/eval_macc_surrogate.py @@ -0,0 +1,218 @@ +import macc_models +import argparse +import os +from os.path import abspath, dirname, join +import google.protobuf.text_format as txtf +import lbann.contrib.launcher +import lbann.contrib.args +from lbann.util import str_list +import datetime + +# ============================================== +# Setup and launch experiment +# ============================================== + +# Default data reader +cur_dir = dirname(abspath(__file__)) +data_reader_prototext = join(dirname(cur_dir), + 'data', + 'jag_conduit_reader.prototext') +metadata_prototext = join(dirname(cur_dir), + 'data', + 'jag_100M_metadata.prototext') + +# Initialize LBANN inf executable +lbann_exe = abspath(lbann.lbann_exe()) +lbann_exe = join(dirname(lbann_exe), 'lbann_inf') + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='eval', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=4096, type=int, + help='mini-batch size (default: 128)', metavar='NUM') +parser.add_argument( + '--num-nodes', action='store', default=4, type=int, + help='number of nodes (default: 4)', metavar='NUM') +parser.add_argument( + '--ppn', action='store', default=4, type=int, + help='processes per node (default: 4)', metavar='NUM') +parser.add_argument( + '--ydim', action='store', default=16399, type=int, + help='image+scalar dim (default: 64*64*4+15=16399)', metavar='NUM') +parser.add_argument( + '--zdim', action='store', default=20, type=int, + help='latent space dim (default: 20)', metavar='NUM') +parser.add_argument( + '--xdim', action='store', default=5, type=int, + help='input (x) dim (default: 5)', metavar='NUM') +parser.add_argument( + '--lamda-cyc', action='store', default=1e-3, type=float, + help='lamda-cyc (default: 1e-3)', metavar='NUM') +parser.add_argument( + '--useCNN', action='store', default=False, type=bool, + help='use CNN', metavar='BOOL') +parser.add_argument( + '--data-filedir-train', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_A/', type=str, + help='data filedir (default train dir is 10MJAG/1M_A)', metavar='NAME') +parser.add_argument( + '--data-filedir-test', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_B/', type=str, + help='data filedir (default test dir is 10MJAG/1M_B)', metavar='NAME') +parser.add_argument( + '--index-list-train', action='store', default='index_eight.txt', type=str, + help='index list (default index_eight 8 samples)', metavar='NAME') +parser.add_argument( + '--index-list-test', action='store', default='t2_index.txt', type=str, + help='index list (default index.txt)', metavar='NAME') +parser.add_argument( + '--percent-of-data-to-use', action='store', default=0.01, type=float, + help='percent of data to use (default: 0.01)', metavar='NUM') +parser.add_argument( + '--dump-outputs', action='store', default='dump_outs', type=str, + help='dump outputs dir (default: jobdir/dump_outs)', metavar='NAME') +parser.add_argument( + '--pretrained-dir', 
action='store', default=None, type=str, + help='pretrained WAE surrogate dir (default: ' ')', metavar='NAME') +parser.add_argument( + '--procs-per-trainer', action='store', default=0, type=int, + help='processes per trainer (default: 0)', metavar='NUM') +args = parser.parse_args() + +print("Pretrained dir ", args.pretrained_dir) +assert args.pretrained_dir, "evaluate script asssumes a pretrained MaCC model" + +def list2str(l): + return ' '.join(l) + +def construct_model(): + """Construct MACC surrogate model. + + See https://arxiv.org/pdf/1912.08113.pdf model architecture and other details + + """ + import lbann + + # Layer graph + input = lbann.Input(target_mode='N/A',name='inp_data') + # data is 64*64*4 images + 15 scalar + 5 param + inp_slice = lbann.Slice(input, axis=0, slice_points=str_list([0,args.ydim,args.ydim+args.xdim]),name='inp_slice') + gt_y = lbann.Identity(inp_slice,name='gt_y') + gt_x = lbann.Identity(inp_slice, name='gt_x') #param not used + + zero = lbann.Constant(value=0.0,num_neurons='1',name='zero') + one = lbann.Constant(value=1.0,num_neurons='1',name='one') + + + z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") + wae = macc_models.MACCWAE(args.zdim,args.ydim,use_CNN=args.useCNN) #pretrained, freeze + inv = macc_models.MACCInverse(args.xdim) + fwd = macc_models.MACCForward(args.zdim) + + + y_pred_fwd = wae.encoder(gt_y) + + param_pred_ = wae.encoder(gt_y) + input_fake = inv(param_pred_) + + output_cyc = fwd(input_fake) + y_image_re2 = wae.decoder(output_cyc) + + '''**** Train cycleGAN input params <--> latent space of (images, scalars) ****''' + output_fake = fwd(gt_x) + y_image_re = wae.decoder(output_fake) + + y_out = wae.decoder(y_pred_fwd) + + param_pred2_ = wae.encoder(y_image_re) + input_cyc = inv(param_pred2_) + + L_l2_x = lbann.MeanSquaredError(input_fake,gt_x) #(x,inv(enc(y)), (encoder+)inverse loss + L_cyc_x = lbann.MeanSquaredError(input_cyc,gt_x) #param, x cycle loss, from latent space + + L_l2_y = lbann.MeanSquaredError(output_fake,y_pred_fwd) #pred error into latent space (enc(y),fw(x)) + L_cyc_y = lbann.MeanSquaredError(output_cyc,y_pred_fwd) # pred error into latent space (enc(y), fw(inv(enc(y)))) + + + #@todo slice here to separate scalar from image + img_sca_loss = lbann.MeanSquaredError(y_image_re,gt_y) # (y,dec(fw(x))) #forward model to decoder, no latent space + dec_fw_inv_enc_y = lbann.MeanSquaredError(y_image_re2,gt_y) #(y, dec(fw(inv(enc(y))))) y->enc_z->x'->fw_z->y' + wae_loss = lbann.MeanSquaredError(y_out,gt_y) #(y, dec(enc(y)) ' + #L_cyc = L_cyc_y + L_cyc_x + L_cyc = lbann.Add(L_cyc_y, L_cyc_x) + + #loss_gen0 = L_l2_y + lamda_cyc*L_cyc + loss_gen0 = lbann.WeightedSum([L_l2_y,L_cyc], scaling_factors=f'1 {args.lamda_cyc}') + loss_gen1 = lbann.WeightedSum([L_l2_x,L_cyc_y], scaling_factors=f'1 {args.lamda_cyc}') + #loss_gen1 = L_l2_x + lamda_cyc*L_cyc_y + + + conc_out = lbann.Concatenation([gt_x,wae_loss,img_sca_loss,dec_fw_inv_enc_y, + L_l2_x], name='x_errors') + layers = list(lbann.traverse_layer_graph(input)) + weights = set() + for l in layers: + weights.update(l.weights) + + # Setup objective function + obj = lbann.ObjectiveFunction([loss_gen0,loss_gen1]) + # Initialize check metric callback + metrics = [lbann.Metric(img_sca_loss, name='img_re1'), + lbann.Metric(dec_fw_inv_enc_y, name='img_re2'), + lbann.Metric(wae_loss, name='wae_loss'), + lbann.Metric(L_l2_x, name='inverse loss'), + lbann.Metric(L_cyc_y, name='output cycle loss'), + lbann.Metric(L_cyc_x, name='param cycle loss')] + + callbacks = [lbann.CallbackPrint(), + 
lbann.CallbackDumpOutputs(layers=f'{conc_out.name}', + execution_modes='test', + directory=args.dump_outputs, + batch_interval=1, + format='npy'), + lbann.CallbackTimer()] + + # Construct model + num_epochs =1 + return lbann.Model(num_epochs, + weights=weights, + layers=layers, + serialize_io=True, + metrics=metrics, + objective_function=obj, + callbacks=callbacks) + + +if __name__ == '__main__': + + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size, + procs_per_trainer=args.procs_per_trainer) + model = construct_model() + # Setup optimizer + opt = lbann.Adam(learn_rate=0.0001,beta1=0.9,beta2=0.99,eps=1e-8) + # Load data reader from prototext + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(data_reader_prototext, 'r') as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + status = lbann.contrib.launcher.run(trainer,model, data_reader_proto, opt, + lbann_exe, + scheduler='lsf', + partition='pdebug', + nodes=args.num_nodes, + procs_per_node=args.ppn, + time_limit=30, + setup_only=False, + batch_job=False, + job_name=args.job_name, + lbann_args=['--preload_data_store --use_data_store --load_model_weights_dir_is_complete', + f'--metadata={metadata_prototext}', + f'--load_model_weights_dir={args.pretrained_dir}', + f'--index_list_test={args.index_list_test}', + f'--data_filedir_test={args.data_filedir_test}'], + **kwargs) + print(status) diff --git a/applications/physics/ICF/jag_models.py b/applications/physics/ICF/jag_models.py new file mode 100644 index 00000000000..47399bee33f --- /dev/null +++ b/applications/physics/ICF/jag_models.py @@ -0,0 +1,71 @@ +import lbann +import lbann.modules.base + + +class WAE(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, encoder_out_dim, decoder_out_dim, name=None): + self.instance = 0 + self.name = (name if name + else 'wae{0}'.format(WAE.global_count)) + + fc = lbann.modules.FullyConnectedModule + disc_neurons = [128,64,1] + encoder_neurons = [32,256,128] + decoder_neurons = [64,128,256] + + #Encoder + self.enc_fc0 = fc(encoder_neurons[0],activation=lbann.Elu,name=self.name+'_enc_fc0') + self.enc_fc1 = fc(encoder_neurons[1],activation=lbann.Tanh,name=self.name+'_enc_fc1') + self.enc_fc2 = fc(encoder_neurons[2],activation=lbann.Tanh,name=self.name+'_enc_fc2') + self.enc_out = fc(encoder_out_dim,name='enc_out') + + #Decoder + self.dec_fc0 = fc(decoder_neurons[0],activation=lbann.Elu,name=self.name+'_dec_fc0') + self.dec_fc1 = fc(decoder_neurons[1],activation=lbann.Tanh,name=self.name+'_dec_fc1') + self.dec_fc2 = fc(decoder_neurons[2],activation=lbann.Tanh,name=self.name+'_dec_fc2') + self.dec_out = fc(decoder_out_dim,name='pred_y') + + #Discriminator1 + self.d0_fc0 = fc(disc_neurons[0],activation=lbann.Relu,name=self.name+'_disc0_fc0') + self.d0_fc1 = fc(disc_neurons[1],activation=lbann.Relu,name=self.name+'_disc0_fc1') + self.d0_fc2 = fc(disc_neurons[2],name=self.name+'_disc0_fc2') + + #Discriminator2 + #stacked_discriminator, this will be frozen, no optimizer, + #layer has to be named for replace layer callback + self.d1_fc0 = fc(disc_neurons[0],activation=lbann.Relu,name=self.name+'_disc1_fc0') + self.d1_fc1 = fc(disc_neurons[1],activation=lbann.Relu,name=self.name+'_disc1_fc1') + self.d1_fc2 = fc(disc_neurons[2],name=self.name+'_disc1_fc2') + + + def forward(self, z, y): + + z_sample = self.forward_encoder(y) + + y_recon = self.forward_decoder(z_sample) + + #d real/fake 
share weights, shared weights is copied to d_adv + #(through replace weight callback) and freeze + d_real = self.forward_discriminator0(lbann.Concatenation([y,z],axis=0)) + y_z_sample = lbann.Concatenation([y,z_sample],axis=0) + d_fake = self.forward_discriminator0(lbann.StopGradient(y_z_sample)) + d_adv = self.forward_discriminator1(y_z_sample) #freeze + + return d_real, d_fake, d_adv,y_recon + + def forward_encoder(self,y): + bn = lbann.BatchNormalization + return self.enc_out(bn(self.enc_fc2(bn(self.enc_fc1(bn(self.enc_fc0(y),epsilon=1e-3) + ),epsilon=1e-3)),epsilon=1e-3)) + + def forward_decoder(self,z): + return self.dec_out(self.dec_fc2(self.dec_fc1(self.dec_fc0(z)))) + + def forward_discriminator0(self,input): + return self.d0_fc2(self.d0_fc1(self.d0_fc0(input))) + + def forward_discriminator1(self,input): + return self.d1_fc2(self.d1_fc1(self.d1_fc0(input))) diff --git a/applications/physics/ICF/lbann_gt_img.png b/applications/physics/ICF/lbann_gt_img.png new file mode 100644 index 00000000000..c0cd09203f9 Binary files /dev/null and b/applications/physics/ICF/lbann_gt_img.png differ diff --git a/applications/physics/ICF/lbann_gt_pred_sca.png b/applications/physics/ICF/lbann_gt_pred_sca.png new file mode 100644 index 00000000000..961653aac4a Binary files /dev/null and b/applications/physics/ICF/lbann_gt_pred_sca.png differ diff --git a/applications/physics/ICF/lbann_pred_img.png b/applications/physics/ICF/lbann_pred_img.png new file mode 100644 index 00000000000..8395d408889 Binary files /dev/null and b/applications/physics/ICF/lbann_pred_img.png differ diff --git a/applications/physics/ICF/macc_models.py b/applications/physics/ICF/macc_models.py new file mode 100644 index 00000000000..bd6e9c9202d --- /dev/null +++ b/applications/physics/ICF/macc_models.py @@ -0,0 +1,156 @@ +import lbann +import lbann.modules.base + + +#Synonymous to fc_gen0 +class MACCForward(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, out_dim,name=None): + self.instance = 0 + self.name = (name if name + else 'macc_forward{0}'.format(MACCForward.global_count)) + + fc = lbann.modules.FullyConnectedModule + + #generator #fc2_gen0 + g_neurons = [32,256,1024] + self.gen_fc = [fc(g_neurons[i],activation=lbann.Relu, name=self.name+'gen_fc'+str(i)) + for i in range(len(g_neurons))] + self.predy = fc(out_dim,name=self.name+'pred_out') + + def forward(self,x): + return self.predy(self.gen_fc[2](self.gen_fc[1](self.gen_fc[0](x)))) + +#Synonymous to fc_gen1 +class MACCInverse(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, out_dim,name=None): + self.instance = 0 + self.name = (name if name + else 'macc_inverse{0}'.format(MACCInverse.global_count)) + + fc = lbann.modules.FullyConnectedModule + + #generator #fc_gen1 + g_neurons = [16,128,64] + self.gen_fc = [fc(g_neurons[i],activation=lbann.Relu, name=self.name+'gen_fc'+str(i)) + for i in range(len(g_neurons))] + self.predx = fc(out_dim,name=self.name+'pred_out') + + def forward(self,y): + return self.predx(self.gen_fc[2](self.gen_fc[1](self.gen_fc[0](y)))) + + +class MACCWAE(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, encoder_out_dim, decoder_out_dim, scalar_dim = 15, use_CNN=False, name=None): + self.instance = 0 + self.name = (name if name + else 'macc_wae{0}'.format(MACCWAE.global_count)) + + self.use_CNN = use_CNN + + fc = lbann.modules.FullyConnectedModule + conv = 
lbann.modules.Convolution2dModule + + disc_neurons = [128,64,1] + encoder_neurons = [32,256,128] + decoder_neurons = [64,128,256] + + enc_outc = [64,32,16] + dec_outc = [32,16,4] + + #Encoder + self.enc_fc0 = fc(encoder_neurons[0],activation=lbann.Elu,name=self.name+'_enc_fc0') + self.enc_fc1 = fc(encoder_neurons[1],activation=lbann.Tanh,name=self.name+'_enc_fc1') + self.enc_fc2 = fc(encoder_neurons[2],activation=lbann.Tanh,name=self.name+'_enc_fc2') + self.enc_out = fc(encoder_out_dim,name=self.name+'enc_out') + + #Decoder + self.dec_fc0 = fc(decoder_neurons[0],activation=lbann.Elu,name=self.name+'_dec_fc0') + self.dec_fc1 = fc(decoder_neurons[1],activation=lbann.Tanh,name=self.name+'_dec_fc1') + self.dec_fc2 = fc(decoder_neurons[2],activation=lbann.Tanh,name=self.name+'_dec_fc2') + self.dec_out = fc(decoder_out_dim,name=self.name+'pred_y') + + #Discriminator1 + self.d0_fc0 = fc(disc_neurons[0],activation=lbann.Relu,name=self.name+'_disc0_fc0') + self.d0_fc1 = fc(disc_neurons[1],activation=lbann.Relu,name=self.name+'_disc0_fc1') + self.d0_fc2 = fc(disc_neurons[2],name=self.name+'_disc0_fc2') + + #Discriminator2 + #stacked_discriminator, this will be frozen, no optimizer, + #layer has to be named for replace layer callback + self.d1_fc0 = fc(disc_neurons[0],activation=lbann.Relu,name=self.name+'_disc1_fc0') + self.d1_fc1 = fc(disc_neurons[1],activation=lbann.Relu,name=self.name+'_disc1_fc1') + self.d1_fc2 = fc(disc_neurons[2],name=self.name+'_disc1_fc2') + + #Encoder_CNN + self.enc_conv = [conv(enc_outc[i], 4, stride=2, padding=1, activation=lbann.Relu, + name=self.name+'_enc_conv'+str(i)) for i in range(len(enc_outc))] + + #Decoder_CNN + #Arxiv paper/PNAS configuration is D1: Dense(32,1024) + self.dec_cnn_fc = fc(16*8*8,activation=lbann.Relu,name=self.name+'_dec_cnn_fc') + self.dec_fc_sca = fc(scalar_dim, name=self.name+'_dec_sca_fc') + self.dec_convT = [conv(dec_outc[i], 4, stride=2, padding=1, + transpose=True, name=self.name+'_dec_conv'+str(i)) + for i in range(len(dec_outc))] + + def forward(self, z, y): + + z_sample = self.encoder(y) + + y_recon = self.decoder(z_sample) + + #d real/fake share weights, shared weights is copied to d_adv + #(through replace weight callback) and freeze + d_real = self.discriminator0(lbann.Concatenation([y,z],axis=0)) + y_z_sample = lbann.Concatenation([y,z_sample],axis=0) + d_fake = self.discriminator0(lbann.StopGradient(y_z_sample)) + d_adv = self.discriminator1(y_z_sample) #freeze + + return d_real, d_fake, d_adv,y_recon + + def encoder(self, y): + return self.encoder_cnn(y) if self.use_CNN else self.encoder_fc(y) + + def encoder_fc(self,y): + return self.enc_out(self.enc_fc2(self.enc_fc1(self.enc_fc0(y)))) + + def encoder_cnn(self,y): + img_sca = lbann.Slice(y, axis=0, slice_points="0 16384 16399", name=self.name+'_y_slice') + #assume C first, is data C first? 
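+        # Slice points: [0, 16384) is the image (4x64x64 = 16384 values,
+        # assumed channels-first for the reshape below) and [16384, 16399)
+        # holds the 15 scalar outputs.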
+ img = lbann.Reshape(img_sca, dims='4 64 64',name=self.name+'enc_reshape0') + x = self.enc_conv[2](self.enc_conv[1](self.enc_conv[0](img))) + x = lbann.Reshape(x, dims=str(16*8*8), name=self.name+'enc_reshape1') + h_stack = lbann.Concatenation([x,img_sca],axis=0) + z = self.enc_out(h_stack) + return z + + def decoder(self, z): + return self.decoder_cnn(z) if self.use_CNN else self.decoder_fc(z) + + def decoder_fc(self,z): + return self.dec_out(self.dec_fc2(self.dec_fc1(self.dec_fc0(z)))) + + def decoder_cnn(self,z): + x = self.dec_cnn_fc(z) + sca = self.dec_fc_sca(lbann.Identity(x)) + img = lbann.Reshape(lbann.Identity(x), dims="16 8 8", name=self.name+'dec_reshape0') + img = self.dec_convT[2](lbann.Relu(self.dec_convT[1](lbann.Relu(self.dec_convT[0](img))))) + #concat for common interface, slice in output + img = lbann.Reshape(img, dims=str(64*64*4), name=self.name+'dec_reshape1') #?? check tensor shape + #todo check that concat size == dec_out_dim + return lbann.Concatenation([img,sca],axis=0) + + def discriminator0(self,input): + return self.d0_fc2(self.d0_fc1(self.d0_fc0(input))) + + def discriminator1(self,input): + return self.d1_fc2(self.d1_fc1(self.d1_fc0(input))) diff --git a/applications/physics/ICF/pre_train_jag_wae.py b/applications/physics/ICF/pre_train_jag_wae.py new file mode 100644 index 00000000000..997e7a26a39 --- /dev/null +++ b/applications/physics/ICF/pre_train_jag_wae.py @@ -0,0 +1,172 @@ +import macc_models +import argparse +from os.path import abspath, dirname, join +import google.protobuf.text_format as txtf +import lbann.contrib.launcher +import lbann.contrib.args +from lbann.util import str_list + +# ============================================== +# Setup and launch experiment +# ============================================== + +# Default data reader +model_zoo_dir = dirname(dirname(abspath(__file__))) +data_reader_prototext = join(model_zoo_dir, + 'data', + 'jag_conduit_reader.prototext') +metadata_prototext = join(model_zoo_dir, + 'data', + 'jag_100M_metadata.prototext') + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='wae', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=128, type=int, + help='mini-batch size (default: 128)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=100, type=int, + help='number of epochs (default: 100)', metavar='NUM') +parser.add_argument( + '--num-nodes', action='store', default=4, type=int, + help='number of nodes (default: 4)', metavar='NUM') +parser.add_argument( + '--ppn', action='store', default=4, type=int, + help='processes per node (default: 4)', metavar='NUM') +parser.add_argument( + '--ydim', action='store', default=16399, type=int, + help='image+scalar dim (default: 64*64*4+15=16399)', metavar='NUM') +parser.add_argument( + '--zdim', action='store', default=20, type=int, + help='latent space dim (default: 20)', metavar='NUM') +parser.add_argument( + '--useCNN', action='store', default=False, type=bool, + help='use CNN', metavar='BOOL') +parser.add_argument( + '--data-filedir-train', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_A/', type=str, + help='data filedir (default train dir is 10MJAG/1M_A)', metavar='NAME') +parser.add_argument( + '--data-filedir-test', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_B/', type=str, + help='data filedir (default test dir is 
10MJAG/1M_B)', metavar='NAME') +parser.add_argument( + '--index-list-train', action='store', default='index.txt', type=str, + help='index list (default index.txt)', metavar='NAME') +parser.add_argument( + '--index-list-test', action='store', default='t0_sample_list_multi_10K.txt', type=str, + help='index list (default t0_sample_list_multi_10K.txt, 100 samples)', metavar='NAME') +parser.add_argument( + '--dump-outputs', action='store', default='dump_outs', type=str, + help='dump outputs dir (default: jobdir/dump_outs)', metavar='NAME') +parser.add_argument( + '--dump-models', action='store', default='dump_models', type=str, + help='dump models dir (default: jobdir/dump_models)', metavar='NAME') +parser.add_argument( + '--procs-per-trainer', action='store', default=0, type=int, + help='processes per trainer (default: 0)', metavar='NUM') +args = parser.parse_args() + + +def list2str(l): + return ' '.join(l) + +def construct_model(): + """Construct LBANN model. + + JAG Wasserstein autoencoder model + + """ + import lbann + + # Layer graph + input = lbann.Input(target_mode='N/A', name='inp_data') + # data is 64*64*4 images + 15 scalar + 5 param + #inp_slice = lbann.Slice(input, axis=0, slice_points="0 16399 16404",name='inp_slice') + inp_slice = lbann.Slice(input, axis=0, slice_points=str_list([0,args.ydim,args.ydim+5]),name='inp_slice') + gt_y = lbann.Identity(inp_slice,name='gt_y') + gt_x = lbann.Identity(inp_slice, name='gt_x') #param not used + + zero = lbann.Constant(value=0.0,num_neurons='1',name='zero') + one = lbann.Constant(value=1.0,num_neurons='1',name='one') + + z_dim = 20 #Latent space dim + + z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") + model = macc_models.MACCWAE(args.zdim,args.ydim,use_CNN=args.useCNN) + d1_real, d1_fake, d_adv, pred_y = model(z,gt_y) + + d1_real_bce = lbann.SigmoidBinaryCrossEntropy([d1_real,one],name='d1_real_bce') + d1_fake_bce = lbann.SigmoidBinaryCrossEntropy([d1_fake,zero],name='d1_fake_bce') + d_adv_bce = lbann.SigmoidBinaryCrossEntropy([d_adv,one],name='d_adv_bce') + img_loss = lbann.MeanSquaredError([pred_y,gt_y]) + rec_error = lbann.L2Norm2(lbann.WeightedSum([pred_y,gt_y], scaling_factors="1 -1")) + + layers = list(lbann.traverse_layer_graph(input)) + # Setup objective function + weights = set() + src_layers = [] + dst_layers = [] + for l in layers: + if(l.weights and "disc0" in l.name and "instance1" in l.name): + src_layers.append(l.name) + #freeze weights in disc2 + if(l.weights and "disc1" in l.name): + dst_layers.append(l.name) + for idx in range(len(l.weights)): + l.weights[idx].optimizer = lbann.NoOptimizer() + weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4) + d_adv_bce = lbann.LayerTerm(d_adv_bce,scale=0.01) + obj = lbann.ObjectiveFunction([d1_real_bce,d1_fake_bce,d_adv_bce,img_loss,rec_error,l2_reg]) + # Initialize check metric callback + metrics = [lbann.Metric(img_loss, name='recon_error')] + #pred_y = macc_models.MACCWAE.pred_y_name + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackSaveModel(dir=args.dump_models), + lbann.CallbackReplaceWeights(source_layers=list2str(src_layers), + destination_layers=list2str(dst_layers), + batch_interval=2)] + + # Construct model + return lbann.Model(args.num_epochs, + serialize_io=True, + weights=weights, + layers=layers, + metrics=metrics, + objective_function=obj, + callbacks=callbacks) + + +if __name__ == '__main__': + import lbann + + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size, + 
procs_per_trainer=args.procs_per_trainer) + model = construct_model() + # Setup optimizer + opt = lbann.Adam(learn_rate=0.0001,beta1=0.9,beta2=0.99,eps=1e-8) + # Load data reader from prototext + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(data_reader_prototext, 'r') as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + status = lbann.contrib.launcher.run(trainer,model, data_reader_proto, opt, + nodes=args.num_nodes, + procs_per_node=args.ppn, + time_limit=720, + setup_only=False, + job_name=args.job_name, + lbann_args=['--use_data_store --preload_data_store', + f'--metadata={metadata_prototext}', + f'--index_list_train={args.index_list_train}', + f'--index_list_test={args.index_list_test}', + f'--data_filedir_train={args.data_filedir_train}', + f'--data_filedir_test={args.data_filedir_test}'], + **kwargs) + print(status) diff --git a/applications/physics/ICF/train_jag_wae.py b/applications/physics/ICF/train_jag_wae.py new file mode 100644 index 00000000000..b6f41e8c493 --- /dev/null +++ b/applications/physics/ICF/train_jag_wae.py @@ -0,0 +1,107 @@ +import jag_models +from os.path import abspath, dirname, join +import google.protobuf.text_format as txtf + +# ============================================== +# Setup and launch experiment +# ============================================== + +# Default data reader +model_zoo_dir = dirname(dirname(abspath(__file__))) +data_reader_prototext = join(model_zoo_dir, + 'data', + 'jag_100Kdata.prototext') + + +def list2str(l): + return ' '.join(l) + +def construct_model(): + """Construct LBANN model. + + JAG Wasserstein autoencoder model + + """ + import lbann + + # Layer graph + input = lbann.Input(target_mode='N/A',name='inp_data') + # data is 64*64*4 images + 15 scalar + 5 param + inp_slice = lbann.Slice(input, axis=0, slice_points="0 16399 16404",name='inp_slice') + gt_y = lbann.Identity(inp_slice,name='gt_y') + gt_x = lbann.Identity(inp_slice, name='gt_x') #param not used + + zero = lbann.Constant(value=0.0,num_neurons='1',name='zero') + one = lbann.Constant(value=1.0,num_neurons='1',name='one') + + y_dim = 16399 #image+scalar shape + z_dim = 20 #Latent space dim + + z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") + d1_real, d1_fake, d_adv, pred_y = jag_models.WAE(z_dim,y_dim)(z,gt_y) + + d1_real_bce = lbann.SigmoidBinaryCrossEntropy([d1_real,one],name='d1_real_bce') + d1_fake_bce = lbann.SigmoidBinaryCrossEntropy([d1_fake,zero],name='d1_fake_bce') + d_adv_bce = lbann.SigmoidBinaryCrossEntropy([d_adv,one],name='d_adv_bce') + + img_loss = lbann.MeanSquaredError([pred_y,gt_y]) + rec_error = lbann.L2Norm2(lbann.WeightedSum([pred_y,gt_y], scaling_factors="1 -1")) + + layers = list(lbann.traverse_layer_graph(input)) + # Setup objective function + weights = set() + src_layers = [] + dst_layers = [] + for l in layers: + if(l.weights and "disc0" in l.name and "instance1" in l.name): + src_layers.append(l.name) + #freeze weights in disc2 + if(l.weights and "disc1" in l.name): + dst_layers.append(l.name) + for idx in range(len(l.weights)): + l.weights[idx].optimizer = lbann.NoOptimizer() + weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4) + d_adv_bce = lbann.LayerTerm(d_adv_bce,scale=0.01) + obj = lbann.ObjectiveFunction([d1_real_bce,d1_fake_bce,d_adv_bce,img_loss,rec_error,l2_reg]) + # Initialize check metric callback + metrics = [lbann.Metric(img_loss, name='recon_error')] + + 
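+    # Note: CallbackReplaceWeights below copies the trained discriminator
+    # weights ("disc0"/instance1 layers collected in src_layers) into the frozen
+    # "disc1" copies (dst_layers, set to NoOptimizer above) every 2 mini-batches,
+    # so d_adv_bce is computed against an up-to-date but non-trainable
+    # discriminator (analogous to discriminator.trainable=False in Keras).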
callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackReplaceWeights(source_layers=list2str(src_layers), + destination_layers=list2str(dst_layers), + batch_interval=2)] + + # Construct model + num_epochs = 100 + return lbann.Model(num_epochs, + weights=weights, + layers=layers, + metrics=metrics, + objective_function=obj, + callbacks=callbacks) + + +if __name__ == '__main__': + import lbann + + mini_batch_size = 128 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model() + # Setup optimizer + opt = lbann.Adam(learn_rate=0.0001,beta1=0.9,beta2=0.99,eps=1e-8) + # Load data reader from prototext + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(data_reader_prototext, 'r') as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + status = lbann.run(trainer,model, data_reader_proto, opt, + scheduler='slurm', + nodes=1, + procs_per_node=1, + time_limit=360, + setup_only=True, + job_name='jag_wae') + print(status) diff --git a/applications/physics/ICF/train_macc_surrogate.py b/applications/physics/ICF/train_macc_surrogate.py new file mode 100644 index 00000000000..396465d5838 --- /dev/null +++ b/applications/physics/ICF/train_macc_surrogate.py @@ -0,0 +1,218 @@ +import macc_models +import argparse +from os.path import abspath, dirname, join +import google.protobuf.text_format as txtf +import lbann.contrib.launcher +import lbann.contrib.args +from lbann.util import str_list + +# ============================================== +# Setup and launch experiment +# ============================================== + +# Default data reader +cur_dir = dirname(abspath(__file__)) +data_reader_prototext = join(dirname(cur_dir), + 'data', + 'jag_conduit_reader.prototext') +metadata_prototext = join(dirname(cur_dir), + 'data', + 'jag_100M_metadata.prototext') + +#model_dir='' +#Load at least pretrained WAE model +#assert model_dir, 'pre_trained_dir should not be empty' +#Assume pre_trained model is in current directory, change path if not +#pre_trained_dir=join(cur_dir,model_dir) + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='surrogate', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=128, type=int, + help='mini-batch size (default: 128)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=100, type=int, + help='number of epochs (default: 100)', metavar='NUM') +parser.add_argument( + '--num-nodes', action='store', default=4, type=int, + help='number of nodes (default: 4)', metavar='NUM') +parser.add_argument( + '--ppn', action='store', default=4, type=int, + help='processes per node (default: 4)', metavar='NUM') +parser.add_argument( + '--ydim', action='store', default=16399, type=int, + help='image+scalar dim (default: 64*64*4+15=16399)', metavar='NUM') +parser.add_argument( + '--zdim', action='store', default=20, type=int, + help='latent space dim (default: 20)', metavar='NUM') +parser.add_argument( + '--xdim', action='store', default=5, type=int, + help='input (x) dim (default: 5)', metavar='NUM') +parser.add_argument( + '--lamda-cyc', action='store', default=1e-3, type=float, + help='lamda-cyc (default: 1e-3)', metavar='NUM') +parser.add_argument( + '--useCNN', action='store', default=False, type=bool, + help='use CNN', metavar='BOOL') +parser.add_argument( + 
'--data-filedir-train', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_A/', type=str, + help='data filedir (default train dir is 10MJAG/1M_A)', metavar='NAME') +parser.add_argument( + '--data-filedir-test', action='store', default='/p/gpfs1/brainusr/datasets/10MJAG/1M_B/', type=str, + help='data filedir (default test dir is 10MJAG/1M_B)', metavar='NAME') +parser.add_argument( + '--index-list-train', action='store', default='index.txt', type=str, + help='index list (default index.txt)', metavar='NAME') +parser.add_argument( + '--index-list-test', action='store', default='t0_sample_list_multi_10K.txt', type=str, + help='index list (default t0_sample_list_multi_10K.txt, 100 samples)', metavar='NAME') +parser.add_argument( + '--dump-outputs', action='store', default='dump_outs', type=str, + help='dump outputs dir (default: jobdir/dump_outs)', metavar='NAME') +parser.add_argument( + '--dump-models', action='store', default='dump_models', type=str, + help='dump models dir (default: jobdir/dump_models)', metavar='NAME') +parser.add_argument( + '--pretrained-dir', action='store', default=' ', type=str, + help='pretrained WAE dir (default: empty)', metavar='NAME') +parser.add_argument( + '--procs-per-trainer', action='store', default=0, type=int, + help='processes per trainer (default: 0)', metavar='NUM') +args = parser.parse_args() + +if not(args.pretrained_dir): + print("WARNING pretrained dir ", args.pretrained_dir, " is empty, default option assumes + pretrained autoencoder") + +def list2str(l): + return ' '.join(l) + +def construct_model(): + """Construct MACC surrogate model. + + See https://arxiv.org/pdf/1912.08113.pdf model architecture and other details + + """ + import lbann + + # Layer graph + input = lbann.Input(target_mode='N/A',name='inp_data') + # data is 64*64*4 images + 15 scalar + 5 param + inp_slice = lbann.Slice(input, axis=0, slice_points=str_list([0,args.ydim,args.ydim+args.xdim]),name='inp_slice') + gt_y = lbann.Identity(inp_slice,name='gt_y') + gt_x = lbann.Identity(inp_slice, name='gt_x') #param not used + + zero = lbann.Constant(value=0.0,num_neurons='1',name='zero') + one = lbann.Constant(value=1.0,num_neurons='1',name='one') + + + z = lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="20") + wae = macc_models.MACCWAE(args.zdim,args.ydim,use_CNN=args.useCNN) #pretrained, freeze + inv = macc_models.MACCInverse(args.xdim) + fwd = macc_models.MACCForward(args.zdim) + + + y_pred_fwd = wae.encoder(gt_y) + + param_pred_ = wae.encoder(gt_y) + input_fake = inv(param_pred_) + + output_cyc = fwd(input_fake) + y_image_re2 = wae.decoder(output_cyc) + + '''**** Train cycleGAN input params <--> latent space of (images, scalars) ****''' + output_fake = fwd(gt_x) + y_image_re = wae.decoder(output_fake) + + param_pred2_ = wae.encoder(y_image_re) + input_cyc = inv(param_pred2_) + + L_l2_x = lbann.MeanSquaredError(input_fake,gt_x) + L_cyc_x = lbann.MeanSquaredError(input_cyc,gt_x) + + L_l2_y = lbann.MeanSquaredError(output_fake,y_pred_fwd) + L_cyc_y = lbann.MeanSquaredError(output_cyc,y_pred_fwd) + + + #@todo slice here to separate scalar from image + img_sca_loss = lbann.MeanSquaredError(y_image_re,gt_y) + #L_cyc = L_cyc_y + L_cyc_x + L_cyc = lbann.Add(L_cyc_y, L_cyc_x) + + #loss_gen0 = L_l2_y + lamda_cyc*L_cyc + loss_gen0 = lbann.WeightedSum([L_l2_y,L_cyc], scaling_factors=f'1 {args.lamda_cyc}') + loss_gen1 = lbann.WeightedSum([L_l2_x,L_cyc_y], scaling_factors=f'1 {args.lamda_cyc}') + #loss_gen1 = L_l2_x + lamda_cyc*L_cyc_y + + + layers = 
list(lbann.traverse_layer_graph(input)) + weights = set() + #Freeze appropriate (pretrained) weights + pretrained_models = ["wae"] #add macc? + for l in layers: + for idx in range(len(pretrained_models)): + if(l.weights and pretrained_models[idx] in l.name): + for w in range(len(l.weights)): + l.weights[w].optimizer = lbann.NoOptimizer() + weights.update(l.weights) + + l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4) + #d_adv_bce = lbann.LayerTerm(d_adv_bce,scale=0.01) + # Setup objective function + obj = lbann.ObjectiveFunction([loss_gen0,loss_gen1,l2_reg]) + # Initialize check metric callback + metrics = [lbann.Metric(img_sca_loss, name='fw_loss'), + lbann.Metric(L_l2_x, name='inverse loss'), + lbann.Metric(L_cyc_y, name='output cycle loss'), + lbann.Metric(L_cyc_x, name='param cycle loss')] + + callbacks = [lbann.CallbackPrint(), + lbann.CallbackSaveModel(dir=args.dump_models), + lbann.CallbackLoadModel(dirs=str(args.pretrained_dir)), + lbann.CallbackTimer()] + + # Construct model + return lbann.Model(args.num_epochs, + weights=weights, + serialize_io=True, + layers=layers, + metrics=metrics, + objective_function=obj, + callbacks=callbacks) + + +if __name__ == '__main__': + import lbann + + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size, + procs_per_trainer=args.procs_per_trainer) + model = construct_model() + # Setup optimizer + opt = lbann.Adam(learn_rate=0.0001,beta1=0.9,beta2=0.99,eps=1e-8) + # Load data reader from prototext + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(data_reader_prototext, 'r') as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + status = lbann.contrib.launcher.run(trainer,model, data_reader_proto, opt, + scheduler='lsf', + nodes=args.num_nodes, + procs_per_node=args.ppn, + partition='pbatch', + time_limit=480, + setup_only=False, + job_name=args.job_name, + lbann_args=['--preload_data_store --use_data_store', + f'--metadata={metadata_prototext}', + f'--index_list_train={args.index_list_train}', + f'--index_list_test={args.index_list_test}', + f'--data_filedir_train={args.data_filedir_train}', + f'--data_filedir_test={args.data_filedir_test}'], + **kwargs) + print(status) diff --git a/applications/physics/cosmology/ExaGAN/ExaGAN.py b/applications/physics/cosmology/ExaGAN/ExaGAN.py new file mode 100644 index 00000000000..17a8bc90dfb --- /dev/null +++ b/applications/physics/cosmology/ExaGAN/ExaGAN.py @@ -0,0 +1,99 @@ +import lbann +import lbann.modules.base +import lbann.models.resnet + + +class CosmoGAN(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, name=None): + self.instance = 0 + self.name = (name if name + else 'ExaGAN{0}'.format(CosmoGAN.global_count)) + + convbnrelu = lbann.models.resnet.ConvBNRelu + fc = lbann.modules.FullyConnectedModule + conv = lbann.modules.Convolution2dModule + #bn_stats_grp_sz = 0 #0 global, 1 local + bn_stats_grp_sz = -1 #0 global, 1 local + + ##MCR properties #@todo: make multichannel optional + self.datascale = 4 + self.linear_scaler=1000. 
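+        # Note: datascale and linear_scaler parameterize inv_transform() below,
+        # which forms the second discriminator input channel as
+        # tanh( datascale * (1 + x) / (1 - x) / linear_scaler ).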
+ + self.inits = {'dense': lbann.NormalInitializer(mean=0,standard_deviation=0.02), + 'conv': lbann.NormalInitializer(mean=0,standard_deviation=0.02), #should be truncated Normal + 'convT':lbann.NormalInitializer(mean=0,standard_deviation=0.02)} + + d_neurons = [64,128,256,512] + self.d1_conv = [convbnrelu(d_neurons[i], 4, 2, 1, False, bn_stats_grp_sz, False,name=self.name+'_disc1_conv'+str(i)) + for i in range(len(d_neurons))] + self.d1_fc = fc(1,name=self.name+'_disc1_fc', + weights=[lbann.Weights(initializer=self.inits['dense'])]) + + #stacked_discriminator, this will be frozen, no optimizer, + #layer has to be named for callback + self.d2_conv = [convbnrelu(d_neurons[i], 4, 2, 1, False, bn_stats_grp_sz, False,name=self.name+'_disc2_conv'+str(i)) + for i in range(len(d_neurons))] + self.d2_fc = fc(1,name=self.name+'_disc2_fc', + weights=[lbann.Weights(initializer=self.inits['dense'])]) + #generator + g_neurons = [256,128,64] + + self.g_convT = [conv(g_neurons[i], 5, stride=2, padding=2, transpose=True, + weights=[lbann.Weights(initializer=self.inits['convT'])]) + for i in range(len(g_neurons))] + + self.g_fc1 = fc(32768,name=self.name+'_gen_fc1', + weights=[lbann.Weights(initializer=self.inits['dense'])]) + self.g_convT3 = conv(1, 5, stride=2, padding=2, activation=lbann.Tanh,name='gen_img',transpose=True, + weights=[lbann.Weights(initializer=self.inits['convT'])]) + + def forward(self, img, z): + #description + d1_real = self.forward_discriminator1(img) #instance1 + gen_img = self.forward_generator(z) + d1_fake = self.forward_discriminator1(lbann.StopGradient(gen_img)) #instance2 + d_adv = self.forward_discriminator2(gen_img) #instance 3 //need to freeze + #d1s share weights, d1_w is copied to d_adv (through replace weight callback) and freeze + return d1_real, d1_fake, d_adv,gen_img + + def forward_discriminator1(self,y): + ch2 = self.inv_transform(lbann.Identity(y)) + y = lbann.Concatenation(lbann.Identity(y),ch2,axis=0) + img = lbann.Reshape(y, dims='2 128 128') + x = lbann.LeakyRelu(self.d1_conv[0](img), negative_slope=0.2) + x = lbann.LeakyRelu(self.d1_conv[1](x), negative_slope=0.2) + x = lbann.LeakyRelu(self.d1_conv[2](x), negative_slope=0.2) + x = lbann.LeakyRelu(self.d1_conv[3](x), negative_slope=0.2) + return self.d1_fc(lbann.Reshape(x,dims='32768')) + + def forward_discriminator2(self,y): + ch2 = self.inv_transform(lbann.Identity(y)) + y = lbann.Concatenation(lbann.Identity(y),ch2,axis=0) + img = lbann.Reshape(y, dims='2 128 128') + x = lbann.LeakyRelu(self.d2_conv[0](img), negative_slope=0.2) + x = lbann.LeakyRelu(self.d2_conv[1](x), negative_slope=0.2) + x = lbann.LeakyRelu(self.d2_conv[2](x), negative_slope=0.2) + x = lbann.LeakyRelu(self.d2_conv[3](x), negative_slope=0.2) + return self.d2_fc(lbann.Reshape(x,dims='32768')) + + def forward_generator(self,z): + x = lbann.Relu(lbann.BatchNormalization(self.g_fc1(z),decay=0.9,scale_init=1.0,epsilon=1e-5)) + x = lbann.Reshape(x, dims='512 8 8') #channel first + x = lbann.Relu(lbann.BatchNormalization(self.g_convT[0](x),decay=0.9,scale_init=1.0,epsilon=1e-5)) + x = lbann.Relu(lbann.BatchNormalization(self.g_convT[1](x),decay=0.9,scale_init=1.0,epsilon=1e-5)) + x = lbann.Relu(lbann.BatchNormalization(self.g_convT[2](x),decay=0.9,scale_init=1.0,epsilon=1e-5)) + return self.g_convT3(x) + + def inv_transform(self,y): + inv_transform = lbann.WeightedSum( + lbann.SafeDivide( + lbann.Add(lbann.Constant(value=1.0, hint_layer=y),lbann.Identity(y)), + lbann.Subtract(lbann.Constant(value=1.0, hint_layer=y),lbann.Identity(y))), + 
scaling_factors=str(self.datascale)) + linear_scale = 1/self.linear_scaler + CH2 = lbann.Tanh(lbann.WeightedSum(inv_transform,scaling_factors=str(linear_scale))) + return CH2 + diff --git a/applications/physics/cosmology/ExaGAN/README.md b/applications/physics/cosmology/ExaGAN/README.md new file mode 100644 index 00000000000..ca5f0540537 --- /dev/null +++ b/applications/physics/cosmology/ExaGAN/README.md @@ -0,0 +1,8 @@ +## Generative Models for Cosmology - Understanding the Nature of the Universe at Exascale + +LBANN implementation of a number of generative models for cosmology. Please see [link](https://github.com/pzharrington/ExaGAN/) for original Keras implementation of code in this directory and other details. Also, see LBANN documentations on how to install, build and run LBANN code. + +### How to Train +```bash +run python3 train_exagan.py +``` diff --git a/applications/physics/cosmology/ExaGAN/dataset.py b/applications/physics/cosmology/ExaGAN/dataset.py new file mode 100644 index 00000000000..c55f8c902a4 --- /dev/null +++ b/applications/physics/cosmology/ExaGAN/dataset.py @@ -0,0 +1,25 @@ +import numpy as np +from os.path import abspath, dirname, join +import google.protobuf.text_format as txtf + +# Data paths +data_dir = '/p/lustre2/brainusr/datasets/cosmoflow/norm_train200K.npy' + +samples = np.load(data_dir, allow_pickle=True) +samples = samples.transpose(0,3,1,2) + + +dims = 128*128*1 + +# Sample access functions +def get_sample(index): + sample = samples[index].flatten() + #normalization here if unnormalized + return sample + +def num_samples(): + return samples.shape[0] + +def sample_dims(): + return [dims] + diff --git a/applications/physics/cosmology/ExaGAN/train_exagan.py b/applications/physics/cosmology/ExaGAN/train_exagan.py new file mode 100644 index 00000000000..d27999d82b4 --- /dev/null +++ b/applications/physics/cosmology/ExaGAN/train_exagan.py @@ -0,0 +1,126 @@ +import ExaGAN +import dataset +import lbann.contrib.launcher + +# ============================================== +# Setup and launch experiment +# ============================================== + +def list2str(l): + return ' '.join(l) + +def construct_model(): + """Construct LBANN model. 
+ + ExaGAN model + + """ + import lbann + + # Layer graph + input = lbann.Input(target_mode='N/A',name='inp_img') + #label flipping + label_flip_rand = lbann.Uniform(min=0,max=1, neuron_dims='1') + label_flip_prob = lbann.Constant(value=0.01, num_neurons='1') + one = lbann.GreaterEqual(label_flip_rand,label_flip_prob, name='is_real') + zero = lbann.LogicalNot(one,name='is_fake') + + z = lbann.Reshape(lbann.Gaussian(mean=0.0,stdev=1.0, neuron_dims="64", name='noise_vec'),dims='1 64') + d1_real, d1_fake, d_adv, gen_img = ExaGAN.CosmoGAN()(input,z) + + d1_real_bce = lbann.SigmoidBinaryCrossEntropy([d1_real,one],name='d1_real_bce') + d1_fake_bce = lbann.SigmoidBinaryCrossEntropy([d1_fake,zero],name='d1_fake_bce') + d_adv_bce = lbann.SigmoidBinaryCrossEntropy([d_adv,one],name='d_adv_bce') + + layers = list(lbann.traverse_layer_graph(input)) + # Setup objective function + weights = set() + src_layers = [] + dst_layers = [] + for l in layers: + if(l.weights and "disc1" in l.name and "instance1" in l.name): + src_layers.append(l.name) + #freeze weights in disc2, analogous to discrim.trainable=False in Keras + if(l.weights and "disc2" in l.name): + dst_layers.append(l.name) + for idx in range(len(l.weights)): + l.weights[idx].optimizer = lbann.NoOptimizer() + weights.update(l.weights) + #l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4) + obj = lbann.ObjectiveFunction([d1_real_bce,d1_fake_bce,d_adv_bce]) + # Initialize check metric callback + metrics = [lbann.Metric(d1_real_bce,name='d_real'), + lbann.Metric(d1_fake_bce, name='d_fake'), + lbann.Metric(d_adv_bce,name='gen')] + + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + #Uncomment to dump output for plotting and further statistical analysis + #lbann.CallbackDumpOutputs(layers='inp_img gen_img_instance1_activation', + # execution_modes='train validation', + # directory='dump_outs', + # batch_interval=100, + # format='npy'), + lbann.CallbackReplaceWeights(source_layers=list2str(src_layers), + destination_layers=list2str(dst_layers), + batch_interval=2)] + + # Construct model + num_epochs = 20 + return lbann.Model(num_epochs, + weights=weights, + layers=layers, + metrics=metrics, + objective_function=obj, + callbacks=callbacks) + +def construct_data_reader(): + """Construct Protobuf message for Python data reader. + + The Python data reader will import this Python file to access the + sample access functions. 
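+
+    The reader is wired to the dataset module in this directory: get_sample(i)
+    returns one flattened image sample, num_samples() the sample count, and
+    sample_dims() the flattened shape, matching the python.* fields set below.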
+ + """ + import os.path + import lbann + module_file = os.path.abspath(__file__) + module_name = os.path.splitext(os.path.basename(module_file))[0] + module_dir = os.path.dirname(module_file) + + # Base data reader message + message = lbann.reader_pb2.DataReader() + + # Training set data reader + data_reader = message.reader.add() + data_reader.name = 'python' + data_reader.role = 'train' + data_reader.shuffle = True + data_reader.percent_of_data_to_use = 1.0 + data_reader.validation_percent = 0.1 + data_reader.python.module = 'dataset' + data_reader.python.module_dir = module_dir + data_reader.python.sample_function = 'get_sample' + data_reader.python.num_samples_function = 'num_samples' + data_reader.python.sample_dims_function = 'sample_dims' + + return message + +if __name__ == '__main__': + import lbann + + mini_batch_size = 64 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model() + # Setup optimizer + opt = lbann.Adam(learn_rate=0.0002,beta1=0.5,beta2=0.99,eps=1e-8) + # Load data reader from prototext + data_reader = construct_data_reader() + status = lbann.contrib.launcher.run(trainer,model, data_reader, opt, + scheduler='slurm', + #account='lbpm', + nodes=1, + procs_per_node=1, + time_limit=1440, + setup_only=False, + job_name='exagan') + print(status) diff --git a/applications/physics/data/hydra_metadata.prototext b/applications/physics/data/hydra_metadata.prototext new file mode 100644 index 00000000000..ec1701f9042 --- /dev/null +++ b/applications/physics/data/hydra_metadata.prototext @@ -0,0 +1,114 @@ +######################################################################## +# The HYDRA normalization values were computed over the +# the 00008 set of 100 files (10K samples), June, 2019 +# John Field cautions that the HYDRA schema will change in +# the future +######################################################################## + +data_set_metadata { + schema { + split_jag_image_channels: false + + # JAG_Image, JAG_Scalar, JAG_Input + independent: [ { pieces: [ JAG_Image, JAG_Scalar ] }, { pieces: [ JAG_Input ] } ] + dependent: [ { pieces: [ JAG_Input ] } ] + + image_prefix: "/images/" + + ## all hydra image keys: + # "(90,0)/bang/image/data" + # "(90,0)/0.03/image/data" + # "(90,0)/0.02/image/data" + # "(90,0)/0.01/image/data" + # + # "(0,0)/bang/image/data" + # "(0,0)/0.03/image/data" + # "(0,0)/0.02/image/data" + # "(0,0)/0.01/image/data" + jag_image_keys: ["(90,0)/bang/image/data", "(0,0)/bang/image/data"] + + scalar_prefix: "/scalars/" + + # An empty list indicates to use all + # The commented out variables are not on the Jim's original list but used in the numpy-based format + jag_scalar_keys: + [ "avg_rhor", + "peak_eprod", + "peak_tion_bw_DT", + "bt_tion_bw_DT", + "avg_tion_bw_DT", + "adiabat", + "bangt", + "burnwidth", + "bt_rhor", + "bt_eprodr", + "peak_eprodr" + ] + + # When using all the keys without explicit selection, key filters can be used + # to explicitly exclude the particular variables with keys that matches a filter. + # 'jag_scalar_filters' and 'jag_input_filters' rely on exact key string matching. + # 'jag_scalar_prefix_filters' and 'jag_input_prefix_filters' define a filter as + # the pair of a prefix substring and the minimum key length. + # For example, with the example below, any key that has a length no shorter + # than 26 and starts with the substring "image_(" is excluded. 
+ + jag_scalar_prefix_filters: [ { key_prefix: "image_(" min_len: 26} ] + jag_scalar_filters: [ "iBT" ] + + input_prefix: "/inputs/" + + jag_input_keys: ["preheat", + "sc_peak", + "t_3rd", + "t_end" + ] + } + + jag_input_normalization_params: [ + { scale: 0.0337373 bias: -0.0105617 }, #p_preheat avg= 15.4355 + { scale: 1.04127 bias: 0.49368 }, #sc_peak avg= 0.00650919 + { scale: 1.00482 bias: 0.499533 }, #t_3rd avg= -0.0241983 + { scale: 1.00725 bias: 0.496931 } #t_end avg= -0.00750582 + ] + + jag_scalar_normalization_params: [ + { scale: 1.82482 bias: -0.511432 }, #avg_rhor avg= 0.529763 + { scale: 0.681226 bias: -0.0150223 }, #peak_eprod avg= 0.201066 + { scale: 0.198934 bias: -0.801525 }, #peak_tion_bw_DT avg= 6.37529 + { scale: 0.244173 bias: -0.604468 }, #bt_tion_bw_DT avg= 4.0855 + { scale: 0.269391 bias: -0.656678 }, #avg_tion_bw_DT avg= 3.91583 + { scale: 0.0492209 bias: -0.186354 }, #adiabat avg= 10.6166 + { scale: 522.423 bias: -3.80809 }, #bangt avg= 0.00814444 + { scale: 3787.06 bias: -0.274563 }, #burnwidth avg= 0.000173271 + { scale: 1.68807 bias: -0.510794 }, #bt_rhor avg= 0.578218 + { scale: 5.27623e-05 bias: -0.00320741 }, #bt_eprodr avg= 1572.53 + { scale: 5.21263e-05 bias: -0.00322019 } #peak_eprodr avg= 1587.55 + ] + + # image data shape is (3,3,64,64) + # from John Field: sets of three: {absorption, emission forward, + # and emission back} # Since we are in 1D, forward and back emission + # are the same. + jag_image_normalization_params: [ + { scale: 1.31227 bias: -5.2241e-05 }, #(90,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 }, #(90,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 }, #(90,0)/bang/image/data + { scale: 1.28446 bias: -0.18841 }, #(90,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 }, #(90,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 }, #(90,0)/bang/image/data + { scale: 1.44979 bias: -0.289003 }, #(90,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 }, #(90,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 }, #(90,0)/bang/image/data + { scale: 1.31227 bias: -5.2241e-05 } #(0,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 } #(0,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 } #(0,0)/bang/image/data + { scale: 1.28446 bias: -0.18841 } #(0,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 } #(0,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 } #(0,0)/bang/image/data + { scale: 1.44979 bias: -0.289003 } #(0,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 } #(0,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 } #(0,0)/bang/image/data + ] + +} diff --git a/applications/physics/data/jag_100Kdata.prototext b/applications/physics/data/jag_100Kdata.prototext new file mode 100644 index 00000000000..da5ac912dd8 --- /dev/null +++ b/applications/physics/data/jag_100Kdata.prototext @@ -0,0 +1,12 @@ +data_reader { + reader { + name: "numpy" + role: "train" + shuffle: true + data_filename: "/p/lustre2/brainusr/datasets/jag/jag100K1vw_train.npy" + validation_percent: 0.1 + percent_of_data_to_use: 1.0 + disable_responses: true + disable_labels: true + } +} diff --git a/model_zoo/models/jag/ae_cycle_gan/jag_100M_metadata.prototext b/applications/physics/data/jag_100M_metadata.prototext similarity index 95% rename from model_zoo/models/jag/ae_cycle_gan/jag_100M_metadata.prototext rename to applications/physics/data/jag_100M_metadata.prototext index 1643b6db51a..7e22e71f0a9 100644 --- 
a/model_zoo/models/jag/ae_cycle_gan/jag_100M_metadata.prototext +++ b/applications/physics/data/jag_100M_metadata.prototext @@ -20,7 +20,12 @@ data_set_metadata { image_prefix: "/outputs/images/" - jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] + image_width: 64 + image_height: 64 + image_num_channels: 4 + + #jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] #3 views + jag_image_keys: ["(0.0, 0.0)/0.0/emi"] #1 view, default scalar_prefix: "/outputs/scalars/" diff --git a/applications/physics/data/jag_conduit_reader.prototext b/applications/physics/data/jag_conduit_reader.prototext new file mode 100644 index 00000000000..9e72ce41fb8 --- /dev/null +++ b/applications/physics/data/jag_conduit_reader.prototext @@ -0,0 +1,50 @@ +######################################################################## +# The JAG normalization values were computed over the 10M + 1MA + 1MB random +# pulls from the 100M data set. They are valid for the directories: +# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B) +# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B +# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +######################################################################## + +data_reader { + requires_data_set_metadata: true + + reader { + name: "jag_conduit" + role: "train" + shuffle: true + # change to a lustre path + data_filedir: "/p/gpfs1/brainusr/datasets/10MJAG/1M_A/" + index_list: "index.txt" + index_list_per_trainer: true + index_list_per_model: false + + validation_percent: 0.1 + absolute_sample_count: 0 + percent_of_data_to_use: 1.0 + disable_responses: true + disable_labels: true + + num_labels: 5 + } + + reader { + name: "jag_conduit" + role: "test" + shuffle: true + # change to a lustre path + data_filedir: "/p/gpfs1/brainusr/datasets/10MJAG/1M_B" + index_list: "t0_sample_list_multi_10K.txt" #100 samples + index_list_per_trainer: false + index_list_per_model: false + + validation_percent: 0 + absolute_sample_count: 0 + percent_of_data_to_use: 0.1 + disable_responses: true + disable_labels: true + + num_labels: 5 + } +} diff --git a/applications/selfsupervised/.gitignore b/applications/selfsupervised/.gitignore new file mode 100644 index 00000000000..3dd91f59ec8 --- /dev/null +++ b/applications/selfsupervised/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +experiments diff --git a/applications/selfsupervised/README.md b/applications/selfsupervised/README.md new file mode 100644 index 00000000000..d40bb6aec1c --- /dev/null +++ b/applications/selfsupervised/README.md @@ -0,0 +1,26 @@ +# Experiments in self-supervised learning with Siamese networks + +Reference: + +Nathan T. Mundhenk, Daniel Ho, and Barry Y. Chen. "Improvements to +context based self-supervised learning." In Computer Vision and +Pattern Recognition (CVPR). 2018. + +## Dependencies + +- NumPy +- SciPy +- OpenCV + +``` +pip3 install numpy scipy opencv-python +``` + +## Usage + +``` +python3 main.py +``` + +Data paths are hardcoded for the Pascal cluster at LLNL. Users must be +in the `brainusr` group. 
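+
+`main.py` also exposes a few options (see its argparse block); for example:
+
+```
+# pretrain the Siamese model (the default) for 20 epochs, then fine-tune on CUB-200
+python3 main.py --pretrain siamese --num-patches 3 --pretrain-epochs 20
+
+# generate the experiment scripts and submit them as a batch job
+python3 main.py --batch-job
+```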
diff --git a/applications/selfsupervised/classifier.py b/applications/selfsupervised/classifier.py new file mode 100644 index 00000000000..a6585144811 --- /dev/null +++ b/applications/selfsupervised/classifier.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +import os.path +import google.protobuf.text_format +import lbann +import modules + +def setup(data_reader_file, + name='classifier', + num_labels=200, + mini_batch_size=128, + num_epochs=1000, + learning_rate=0.1, + bn_statistics_group_size=2, + fc_data_layout='model_parallel', + warmup_epochs=50, + learning_rate_drop_interval=50, + learning_rate_drop_factor=0.25, + checkpoint_interval=None): + + # Setup input data + input = lbann.Input() + images = lbann.Identity(input) + labels = lbann.Identity(input) + + # Classification network + head_cnn = modules.ResNet(bn_statistics_group_size=bn_statistics_group_size) + class_fc = lbann.modules.FullyConnectedModule(num_labels, + activation=lbann.Softmax, + name=f'{name}_fc', + data_layout=fc_data_layout) + x = head_cnn(images) + probs = class_fc(x) + + # Setup objective function + cross_entropy = lbann.CrossEntropy([probs, labels]) + l2_reg_weights = set() + for l in lbann.traverse_layer_graph(input): + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=0.0002) + obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Setup model + metrics = [lbann.Metric(lbann.CategoricalAccuracy([probs, labels]), + name='accuracy', unit='%')] + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + if checkpoint_interval: + callbacks.append( + lbann.CallbackCheckpoint( + checkpoint_dir='ckpt', + checkpoint_epochs=5 + ) + ) + + # Learning rate schedules + if warmup_epochs: + callbacks.append( + lbann.CallbackLinearGrowthLearningRate( + target=learning_rate * mini_batch_size / 128, + num_epochs=warmup_epochs + ) + ) + if learning_rate_drop_factor: + callbacks.append( + lbann.CallbackDropFixedLearningRate( + drop_epoch=list(range(0, num_epochs, learning_rate_drop_interval)), + amt=learning_rate_drop_factor) + ) + + # Construct model + model = lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + + # Setup optimizer + # opt = lbann.Adam(learn_rate=learning_rate, beta1=0.9, beta2=0.999, eps=1e-8) + opt = lbann.SGD(learn_rate=learning_rate, momentum=0.9) + + # Load data reader from prototext + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(data_reader_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + for reader_proto in data_reader_proto.reader: + reader_proto.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + + # Return experiment objects + return model, data_reader_proto, opt + +if __name__ == "__main__": + import argparse + import lbann.contrib.args + import lbann.contrib.launcher + + # Command-line arguments + parser = argparse.ArgumentParser() + lbann.contrib.args.add_scheduler_arguments(parser) + parser.add_argument( + '--job-name', action='store', default='lbann_siamese_finetune', type=str, + help=('scheduler job name')) + parser.add_argument( + '--mini-batch-size', action='store', default=128, type=int, + help='mini-batch size (default: 128)', metavar='NUM') + parser.add_argument( + '--num-epochs', action='store', default=1000, type=int, + help='number of epochs (default: 1000)', 
metavar='NUM') + parser.add_argument( + '--learning-rate', action='store', default=0.1, type=float, + help='learning rate (default: 0.1)', metavar='LR') + parser.add_argument( + '--bn-statistics-group-size', action='store', default=2, type=int, + help=('group size for batch norm statistics (default: 2)')) + parser.add_argument( + '--fc-data-layout', action='store', default='model_parallel', type=str, + help=('data layout for fully-connected layers ' + '(default: "model_parallel")')) + args = parser.parse_args() + + # Setup experiment + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) + current_dir = os.path.dirname(os.path.realpath(__file__)) + data_reader_file = os.path.join(current_dir, 'data_reader_cub.prototext') + model, data_reader, opt = setup( + data_reader_file=data_reader_file, + mini_batch_size=args.mini_batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + bn_statistics_group_size=args.bn_statistics_group_size, + fc_data_layout=args.fc_data_layout, + ) + + # Run experiment + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + lbann.contrib.launcher.run( + trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs, + ) diff --git a/applications/selfsupervised/data_reader_cub.prototext b/applications/selfsupervised/data_reader_cub.prototext new file mode 100644 index 00000000000..b63e19e8cef --- /dev/null +++ b/applications/selfsupervised/data_reader_cub.prototext @@ -0,0 +1,61 @@ +data_reader { + reader { + name: "imagenet" + role: "train" + shuffle: true + data_filedir: "/p/lscratchh/brainusr/datasets/CUB_200_2011/basic_set_256x256/" + data_filename: "/p/lscratchh/brainusr/datasets/CUB_200_2011_list/train_list.txt" + validation_percent: 0.0 + percent_of_data_to_use: 1.0 + num_labels: 200 + + transforms { + random_resized_crop { + height: 224 + width: 224 + } + } + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + + } + + reader { + name: "imagenet" + role: "test" + shuffle: true + data_filedir: "/p/lscratchh/brainusr/datasets/CUB_200_2011/basic_set_256x256/" + data_filename: "/p/lscratchh/brainusr/datasets/CUB_200_2011_list/test_list.txt" + percent_of_data_to_use: 1.0 + num_labels: 200 + + transforms { + center_crop { + height: 224 + width: 224 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + + } +} diff --git a/applications/selfsupervised/data_reader_imagenet.prototext b/applications/selfsupervised/data_reader_imagenet.prototext new file mode 100644 index 00000000000..733b8fa80bf --- /dev/null +++ b/applications/selfsupervised/data_reader_imagenet.prototext @@ -0,0 +1,33 @@ +data_reader { + reader { + name: "imagenet" + role: "train" + shuffle: true + data_filedir: "/p/lscratchh/brainusr/ILSVRC2012/original/train/" + data_filename: "/p/lscratchh/brainusr/ILSVRC2012/labels/train.txt" + #data_filename: "/p/lscratchh/brainusr/ILSVRC2012/labels/train_c0-9.txt" + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + random_resized_crop { + height: 224 + width: 224 + } + } + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } +} diff --git a/applications/selfsupervised/main.py b/applications/selfsupervised/main.py new 
file mode 100644 index 00000000000..6364df48b6d --- /dev/null +++ b/applications/selfsupervised/main.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +import os.path +import argparse +import random +import lbann +import lbann.contrib.launcher +import lbann.contrib.args +import lbann.proto +import classifier +import pretrain_siamese +import util + +# Paths +current_dir = os.path.dirname(os.path.realpath(__file__)) + +# ============================================== +# Options +# ============================================== + +# Command-line options +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_siamese', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--pretrain', action='store', default='siamese', type=str, + help='pretraining model (default: siamese)') +parser.add_argument( + '--num-patches', action='store', default=3, type=int, + help='number of patches and Siamese heads (default: 3)', metavar='NUM') +parser.add_argument( + '--pretrain-epochs', action='store', default=20, type=int, + help='number of pretraining epochs (default: 20)', metavar='NUM') +parser.add_argument( + '--batch-job', action='store_true', + help='submit script as batch job') +parser.add_argument( + '--checkpoint-interval', action='store', default=0, type=int, + help='epoch frequency for checkpointing') +args = parser.parse_args() + +# ============================================== +# Setup experiment +# ============================================== + +# Pretraining model +if not args.pretrain or args.pretrain == 'siamese': + model1, reader1, opt1 = pretrain_siamese.setup( + num_patches=args.num_patches, + mini_batch_size=512, + num_epochs=args.pretrain_epochs, + learning_rate=0.005, + checkpoint_interval=args.checkpoint_interval, + ) +elif args.pretrain == 'supervised': + data_reader_file = os.path.join(current_dir, 'data_reader_imagenet.prototext') + model1, reader1, opt1 = classifier.setup( + data_reader_file=data_reader_file, + name='supervised', + num_labels=1000, + mini_batch_size=512, + num_epochs=args.pretrain_epochs, + learning_rate=0.1, + warmup_epochs=5, + learning_rate_drop_interval=30, + learning_rate_drop_factor=0.1, + checkpoint_interval=args.checkpoint_interval, + ) +else: + raise Exception(f'"{args.pretrain}" is an invalid pretraining model') +model1.random_seed = random.getrandbits(32) + +# Fine-tuning model +data_reader_file = os.path.join(current_dir, 'data_reader_cub.prototext') +model2, reader2, opt2 = classifier.setup( + data_reader_file=data_reader_file, + name='finetune', + num_labels=200, + mini_batch_size=128, + num_epochs=500, + learning_rate=0.1, + warmup_epochs=50, + learning_rate_drop_interval=50, + learning_rate_drop_factor=0.25, +) + +# ============================================== +# Construct LBANN invocation +# ============================================== + +# Initialize LBANN executable and command-line arguments +lbann_exe = os.path.realpath(lbann.lbann_exe()) +lbann_exe = os.path.join(os.path.dirname(lbann_exe), 'lbann2') +lbann_command = [lbann_exe] + +# Construct experiment directory +experiment_dir = util.make_experiment_dir(args.job_name) + +# Export model prototext files +# Note: lbann2 driver doesn't have a command-line argument to get +# trainer. 
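+# The command line assembled below passes brace-delimited pairs of prototext
+# files (e.g. --model={model1.prototext,model2.prototext}), so the pretraining
+# and fine-tuning experiments are described by parallel model/reader/optimizer
+# files and handed to lbann2 in a single invocation.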
+file1 = os.path.join(experiment_dir, 'model1.prototext') +file2 = os.path.join(experiment_dir, 'model2.prototext') +lbann.proto.save_prototext(file1, model=model1, trainer=lbann.Trainer(mini_batch_size=512)) +lbann.proto.save_prototext(file2, model=model2, trainer=lbann.Trainer(mini_batch_size=512)) +lbann_command.append(f'--model={{{file1},{file2}}}') + +# Export data reader prototext files +file1 = os.path.join(experiment_dir, 'reader1.prototext') +file2 = os.path.join(experiment_dir, 'reader2.prototext') +lbann.proto.save_prototext(file1, data_reader=reader1) +lbann.proto.save_prototext(file2, data_reader=reader2) +lbann_command.append(f'--reader={{{file1},{file2}}}') + +# Export optimizer prototext files +file1 = os.path.join(experiment_dir, 'opt1.prototext') +file2 = os.path.join(experiment_dir, 'opt2.prototext') +lbann.proto.save_prototext(file1, optimizer=opt1) +lbann.proto.save_prototext(file2, optimizer=opt2) +lbann_command.append(f'--optimizer={{{file1},{file2}}}') + +# ============================================== +# Launch experiment +# ============================================== + +# Construct batch script +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +script = lbann.contrib.launcher.make_batch_script( + work_dir=experiment_dir, + job_name=args.job_name, + **kwargs, +) +script.add_parallel_command(lbann_command) + +# Launch LBANN +if args.batch_job: + script.submit() +else: + script.run() diff --git a/applications/selfsupervised/modules.py b/applications/selfsupervised/modules.py new file mode 100644 index 00000000000..fe5c1f66c1e --- /dev/null +++ b/applications/selfsupervised/modules.py @@ -0,0 +1,190 @@ +import lbann +import lbann.modules +import resnet + +class BatchNormModule(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, + statistics_group_size=1, + name=None, + data_layout='data_parallel'): + super().__init__() + BatchNormModule.global_count += 1 + self.instance = 0 + self.statistics_group_size = statistics_group_size + self.name = (name + if name + else 'bnmodule{0}'.format(BatchNormModule.global_count)) + self.data_layout = data_layout + + # Initialize weights + self.scale = lbann.Weights( + initializer=lbann.ConstantInitializer(value=1.0), + name=self.name + '_scale') + self.bias = lbann.Weights( + initializer=lbann.ConstantInitializer(value=0.0), + name=self.name + '_bias') + self.running_mean = lbann.Weights( + initializer=lbann.ConstantInitializer(value=0.0), + name=self.name + '_running_mean') + self.running_variance = lbann.Weights( + initializer=lbann.ConstantInitializer(value=1.0), + name=self.name + '_running_variance') + + def forward(self, x): + self.instance += 1 + name = '{0}_instance{1}'.format(self.name, self.instance) + return lbann.BatchNormalization( + x, + weights=[self.scale, self.bias, + self.running_mean, self.running_variance], + decay=0.9, + scale_init=1.0, + bias_init=0.0, + epsilon=1e-5, + statistics_group_size=self.statistics_group_size, + name=name, + data_layout=self.data_layout) + +class ConvBnRelu(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, + out_channels, kernel_size, + stride=1, padding=0, + statistics_group_size=1, + name=None): + super().__init__() + ConvBnRelu.global_count += 1 + self.instance = 0 + self.name = (name + if name + else 'convbnrelu{0}'.format(ConvBnRelu.global_count)) + self.conv = lbann.modules.Convolution2dModule(out_channels, + kernel_size, + stride=stride, + padding=padding, + 
bias=False, + name=self.name+'_conv') + self.bn = BatchNormModule(statistics_group_size=statistics_group_size, + name=self.name+'_bn') + + def forward(self, x): + self.instance += 1 + x = self.conv(x) + x = self.bn(x) + return lbann.Relu(x, name='{0}_relu_instance{1}'.format(self.name, self.instance)) + +class FcBnRelu(lbann.modules.Module): + + global_count = 0 # Static counter, used for default names + + def __init__(self, + size, + statistics_group_size=1, + name=None, + data_layout='data_parallel'): + super().__init__() + FcBnRelu.global_count += 1 + self.instance = 0 + self.name = (name + if name + else 'fcbnrelu{0}'.format(FcBnRelu.global_count)) + self.data_layout = data_layout + self.fc = lbann.modules.FullyConnectedModule(size, + bias=False, + name=self.name+'_fc', + data_layout=self.data_layout) + + # Weights for batchnorm + scalebias_vals = [1.0] * size + [0.0] * size + self.bn_weights = [ + lbann.Weights( + name='{0}_bn_running_mean'.format(self.name), + initializer=lbann.ConstantInitializer(value=0.0)), + lbann.Weights( + name='{0}_bn_running_var'.format(self.name), + initializer=lbann.ConstantInitializer(value=1.0)), + lbann.Weights( + name='{0}_bn_scalebias'.format(self.name), + initializer=lbann.ValueInitializer(values=' '.join([str(x) for x in scalebias_vals])))] + + def forward(self, x): + self.instance += 1 + x = self.fc(x) + x = lbann.EntrywiseBatchNormalization(x, + weights=[self.bn_weights[0], self.bn_weights[1]], + decay=0.9, + epsilon=1e-5, + name='{0}_bn_instance{1}'.format(self.name, self.instance), + data_layout=self.data_layout) + x = lbann.EntrywiseScaleBias(x, + weights=self.bn_weights[2], + name='{0}_bn_scalebias_instance{1}'.format(self.name, self.instance), + data_layout=self.data_layout) + return lbann.Relu(x, + name='{0}_relu_instance{1}'.format(self.name, self.instance), + data_layout=self.data_layout) + +class AlexNetCNN(lbann.modules.Module): + """AlexNet CNN with batch norm. + + FC network at end of AlexNet is not included. 
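+
+    Stack (each conv followed by batch norm and ReLU): 96@11x11 stride 4,
+    max-pool 3/2, 256@3x3, max-pool 3/2, 384@3x3, 384@3x3, 256@3x3, max-pool 3/2.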
+ + """ + + def __init__(self, bn_statistics_group_size=1): + self.name = 'alexnet' + self.conv1 = ConvBnRelu(96, 11, + stride=4, + padding=5, + statistics_group_size=bn_statistics_group_size, + name='{0}_conv1'.format(self.name)) + self.conv2 = ConvBnRelu(256, 3, + padding=1, + statistics_group_size=bn_statistics_group_size, + name='{0}_conv2'.format(self.name)) + self.conv3 = ConvBnRelu(384, 3, + padding=1, + statistics_group_size=bn_statistics_group_size, + name='{0}_conv3'.format(self.name)) + self.conv4 = ConvBnRelu(384, 3, + padding=1, + statistics_group_size=bn_statistics_group_size, + name='{0}_conv4'.format(self.name)) + self.conv5 = ConvBnRelu(256, 3, + padding=1, + statistics_group_size=bn_statistics_group_size, + name='{0}_conv5'.format(self.name)) + + def forward(self, x): + x = self.conv1(x) + x = lbann.Pooling(x, num_dims=2, has_vectors=False, + pool_dims_i=3, pool_pads_i=0, pool_strides_i=2, + pool_mode='max') + x = self.conv2(x) + x = lbann.Pooling(x, num_dims=2, has_vectors=False, + pool_dims_i=3, pool_pads_i=0, pool_strides_i=2, + pool_mode='max') + x = self.conv3(x) + x = self.conv4(x) + x = self.conv5(x) + x = lbann.Pooling(x, num_dims=2, has_vectors=False, + pool_dims_i=3, pool_pads_i=0, pool_strides_i=2, + pool_mode='max') + return x + +class ResNet(lbann.modules.Module): + + def __init__(self, bn_statistics_group_size=1): + self.name = 'resnet' + self.cnn = resnet.ResNet34(bn_statistics_group_size=bn_statistics_group_size, + name=self.name) + + def forward(self, x): + x = self.cnn(x) + x = lbann.ChannelwiseMean(x) + return x diff --git a/applications/selfsupervised/patch_generator/__init__.py b/applications/selfsupervised/patch_generator/__init__.py new file mode 100644 index 00000000000..b8e81340273 --- /dev/null +++ b/applications/selfsupervised/patch_generator/__init__.py @@ -0,0 +1,136 @@ +import functools +import operator +import os.path +import random +import sys +import cv2 +import numpy as np +from .extract_patches import extract_patches +from .patterns import patterns_2patch, patterns_3patch, patterns_4patch, patterns_5patch +from .chroma_blur import chroma_blur + +# Data paths +label_file = '/p/lscratchh/brainusr/ILSVRC2012/labels/train.txt' +data_dir = '/p/lscratchh/brainusr/ILSVRC2012/original/train' + +# Read label files +samples = [] +with open(label_file) as f: + for line in f: + line = line.split(' ') + samples.append((line[0], int(line[1]))) + +# Get sample function +def get_sample_2patch(index): + return get_sample(index, 2) +def get_sample_3patch(index): + return get_sample(index, 3) +def get_sample_4patch(index): + return get_sample(index, 4) +def get_sample_5patch(index): + return get_sample(index, 5) +def get_sample(index, num_patches): + """Generate data sample. + + Extract patches and apply preprocessing tricks. 
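+
+    Steps: crop the image to a centered square, extract patches for a randomly
+    chosen pattern, apply a random 90-degree rotation (folded into the label),
+    chroma-blur, convert to normalized CHW tensors, zero everything outside a
+    random aperture in every patch but the first, and append a one-hot label
+    vector to the flattened patches.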
+ """ + + # Read image from file + file_name, _ = samples[index] + file_name = os.path.join(data_dir, file_name) + img = cv2.imdecode(np.fromfile(file_name, dtype=np.uint8), + cv2.IMREAD_COLOR) + + # Crop to get square image + size = min(img.shape[0], img.shape[1]) + y = (img.shape[0] - size) // 2 + x = (img.shape[1] - size) // 2 + img = img[y:y+size, x:x+size, :] + + # Extract patches + patterns = None + if num_patches == 2: + patterns = patterns_2patch + if num_patches == 3: + patterns = patterns_3patch + if num_patches == 4: + patterns = patterns_4patch + if num_patches == 5: + patterns = patterns_5patch + patches, label = extract_patches(img, patterns) + + # Randomly rotate patches + rotate_type = random.randint(0, 3) + for i, patch in enumerate(patches): + patch = np.rot90(patch, rotate_type, axes=(0,1)) + patches[i] = patch + label = label + rotate_type * len(patterns) + + # Convert patch to float32 + for i, patch in enumerate(patches): + if patch.dtype == np.uint8: + patches[i] = patch.astype(np.float32) / 255 + + # Chroma blur + for i, patch in enumerate(patches): + patches[i] = chroma_blur(patch) + + # Transform to CHW format and normalize + for i, patch in enumerate(patches): + patch = np.transpose(patch, axes=(2, 0, 1)) + means = np.array([0.406, 0.456, 0.485]).reshape((3,1,1)) + stdevs = np.array([0.225, 0.224, 0.229]).reshape((3,1,1)) + patch -= means + patch /= stdevs + patches[i] = patch + + # Random aperture + for i, patch in enumerate(patches): + if i == 0: + continue + size = random.randint(64, 96) + y = random.randint(0, 96-size) + x = random.randint(0, 96-size) + new_patch = np.zeros((3, 96, 96), dtype=np.float32) + new_patch[:, y:y+size, x:x+size] = patch[:, y:y+size, x:x+size] + patches[i] = new_patch + + # Construct one-hot label vector + label_vec = np.zeros(num_labels(num_patches), dtype=np.float32) + label_vec[label] = 1 + + # Return flattened data tensors + flat_data = [] + for patch in patches: + flat_data.append(patch.reshape(-1)) + flat_data.append(label_vec) + return np.concatenate(flat_data) + +# Get sample dims functions +patch_dims = (3, 96, 96) +def num_labels(num_patches): + num_patterns = 0 + if num_patches == 2: + num_patterns = len(patterns_2patch) + if num_patches == 3: + num_patterns = len(patterns_3patch) + if num_patches == 4: + num_patterns = len(patterns_4patch) + if num_patches == 5: + num_patterns = len(patterns_5patch) + return 4 * num_patterns +def sample_dims(num_patches): + patch_size = functools.reduce(operator.mul, patch_dims) + return (num_patches*patch_size + num_labels(num_patches),) +def sample_dims_2patch(): + return sample_dims(2) +def sample_dims_3patch(): + return sample_dims(3) +def sample_dims_4patch(): + return sample_dims(4) +def sample_dims_5patch(): + return sample_dims(5) + +# Get num samples function +def num_samples(): + return len(samples) diff --git a/applications/selfsupervised/patch_generator/chroma_blur.py b/applications/selfsupervised/patch_generator/chroma_blur.py new file mode 100644 index 00000000000..ee9e4a57406 --- /dev/null +++ b/applications/selfsupervised/patch_generator/chroma_blur.py @@ -0,0 +1,16 @@ +import numpy as np +import scipy.ndimage.filters +import cv2 + +def chroma_blur(img): + """Blur chroma channels to hide chromatic aberration. + + Convert to CIE Lab format and apply box filter to a and b + channels. 
+ + """ + img = cv2.cvtColor(img, cv2.COLOR_BGR2Lab) + img[:,:,1] = scipy.ndimage.filters.uniform_filter(img[:,:,1], 13) + img[:,:,2] = scipy.ndimage.filters.uniform_filter(img[:,:,2], 13) + img = cv2.cvtColor(img, cv2.COLOR_Lab2BGR) + return img diff --git a/applications/selfsupervised/patch_generator/extract_patches.py b/applications/selfsupervised/patch_generator/extract_patches.py new file mode 100644 index 00000000000..e24733ae23f --- /dev/null +++ b/applications/selfsupervised/patch_generator/extract_patches.py @@ -0,0 +1,116 @@ +import enum +import math +import random +import cv2 +import numpy as np + +# ---------------------------------------------- +# Patch type specification +# ---------------------------------------------- +# Note: Sizes and positions are in pixels. + +class PatchType(enum.Enum): + _3X3 = 1 + _2X2 = 2 + OVERLAP = 3 + +# 3x3-type patches +_3x3_patch_pos = ((0.0, 0.0), (0.0, 137/384), (0.0, 274/384), + (137/384, 0.0), (137/384, 137/384), (137/384, 274/384), + (274/384, 0.0), (274/384, 137/384), (274/384, 274/384)) +_3x3_patch_size = 110/384 + +# 2x2-type patches +_2x2_patch_pos = ((0, 0), (0, 146/256), + (146/256, 0), (146/256, 146/256)) +_2x2_patch_size = 110/256 + +# Overlap-type patches +overlap_patch_pos = ((0, 0), (0, 86/196), + (86/196, 0), (86/196, 86/196)) +overlap_patch_size = 110/196 + +# ---------------------------------------------- +# Patch extraction +# ---------------------------------------------- + +def extract_patch(img, patch_type, index, zoom, jitter): + """Extract a patch from image and resize. + + Args: + img (ndarry): Image in HWC format. + patch_type (PatchType): Desired patch type. + index (int): Patch index. + zoom (float): Zoom factor. + jitter ((float, float)): Jitter positions, normalized in + [0,1). + + Returns: + ndarray: Patch in HWC format. + + """ + + # Get patch position + if patch_type == PatchType._3X3: + posy = _3x3_patch_pos[index][0] + posx = _3x3_patch_pos[index][1] + patch_size = _3x3_patch_size + if patch_type == PatchType._2X2: + posy = _2x2_patch_pos[index][0] + posx = _2x2_patch_pos[index][1] + patch_size = _2x2_patch_size + if patch_type == PatchType.OVERLAP: + posy = overlap_patch_pos[index][0] + posx = overlap_patch_pos[index][1] + patch_size = overlap_patch_size + + # Apply zoom and jitter to patch position + posy += (1-1/zoom) * patch_size * jitter[0] + posx += (1-1/zoom) * patch_size * jitter[1] + patch_size /= zoom + + # Identify patch pixels + img_size = img.shape[0] + y0 = math.floor(posy * img_size) + y1 = math.ceil((posy + patch_size) * img_size) + x0 = math.floor(posx * img_size) + x1 = math.ceil((posx + patch_size) * img_size) + y0 = max(0, min(img_size-1, y0)) + y1 = max(1, min(img_size, y1)) + x0 = max(0, min(img_size-1, x0)) + x1 = max(1, min(img_size, x1)) + + # Extract patch from image + interp_methods = (cv2.INTER_LINEAR, cv2.INTER_AREA, + cv2.INTER_CUBIC, cv2.INTER_LANCZOS4) + patch = cv2.resize(img[y0:y1, x0:x1, :], + (96, 96), + interpolation=random.choice(interp_methods)) + + # Randomly apply horizontal flip + if random.choice([True, False]): + patch = np.fliplr(patch) + + return patch + +def extract_patches(img, patterns): + """Extract patches from image. + + Args: + img (ndarry): Image in HWC format. + patterns (list of (list of (PatchType, int))): Patch patterns. + See patterns.py. + + Returns: + list of ndarray: Patches in HWC format. + int: Patch pattern label. 
+ + """ + + label = random.randint(0, len(patterns)-1) + zoom = random.uniform(1, 128/96) + jitter = (random.random(), random.random()) + patches = [extract_patch(img, p[0], p[1], zoom, jitter) + for p in patterns[label]] + random.shuffle(patches) + return patches, label diff --git a/applications/selfsupervised/patch_generator/patterns.py b/applications/selfsupervised/patch_generator/patterns.py new file mode 100644 index 00000000000..6a65223c6d6 --- /dev/null +++ b/applications/selfsupervised/patch_generator/patterns.py @@ -0,0 +1,147 @@ +from .extract_patches import PatchType +_3X3 = PatchType._3X3 +_2X2 = PatchType._2X2 +OVERLAP = PatchType.OVERLAP + +# 2-patch configurations +# See: Carl Doersch, Abhinav Gupta, and Alexei A. Efros. "Unsupervised +# visual representation learning by context prediction." In +# Proceedings of the IEEE International Conference on Computer +# Vision, pp. 1422-1430. 2015. +patterns_2patch = ( + ((_3X3, 4), (_3X3, 0)), + ((_3X3, 4), (_3X3, 1)), + ((_3X3, 4), (_3X3, 2)), + ((_3X3, 4), (_3X3, 5)), + ((_3X3, 4), (_3X3, 8)), + ((_3X3, 4), (_3X3, 7)), + ((_3X3, 4), (_3X3, 6)), + ((_3X3, 4), (_3X3, 3)) +) + +# 3-patch configurations +# See: T. Nathan Mundhenk, Daniel Ho, and Barry Y. Chen. "Improvements +# to Context Based Self-Supervised Learning." In CVPR, pp. +# 9339-9348. 2018. +patterns_3patch = ( + + # Line + ((_3X3, 4), (_3X3, 0), (_3X3, 8)), + ((_3X3, 4), (_3X3, 1), (_3X3, 7)), + ((_3X3, 4), (_3X3, 2), (_3X3, 6)), + ((_3X3, 4), (_3X3, 5), (_3X3, 3)), + + # L-shape + ((_2X2, 0), (_2X2, 1), (_2X2, 3)), + ((_2X2, 1), (_2X2, 3), (_2X2, 2)), + ((_2X2, 3), (_2X2, 2), (_2X2, 0)), + ((_2X2, 2), (_2X2, 0), (_2X2, 1)), + + # Hybrid scale patches + ((OVERLAP, 0), (_3X3, 2), (_3X3, 5)), + ((OVERLAP, 0), (_3X3, 6), (_3X3, 7)), + ((OVERLAP, 1), (_3X3, 8), (_3X3, 7)), + ((OVERLAP, 1), (_3X3, 0), (_3X3, 3)), + ((OVERLAP, 3), (_3X3, 6), (_3X3, 3)), + ((OVERLAP, 3), (_3X3, 2), (_3X3, 1)), + ((OVERLAP, 2), (_3X3, 0), (_3X3, 1)), + ((OVERLAP, 2), (_3X3, 8), (_3X3, 5)) + +) + +# 4-patch configurations +patterns_4patch = ( + + # T-shape + ((_3X3, 4), (_3X3, 1), (_3X3, 5), (_3X3, 7)), + ((_3X3, 4), (_3X3, 5), (_3X3, 7), (_3X3, 3)), + ((_3X3, 4), (_3X3, 7), (_3X3, 3), (_3X3, 1)), + ((_3X3, 4), (_3X3, 3), (_3X3, 1), (_3X3, 5)), + + # Z-shape + ((_3X3, 4), (_3X3, 2), (_3X3, 5), (_3X3, 7)), + ((_3X3, 4), (_3X3, 8), (_3X3, 7), (_3X3, 3)), + ((_3X3, 4), (_3X3, 6), (_3X3, 3), (_3X3, 1)), + ((_3X3, 4), (_3X3, 0), (_3X3, 1), (_3X3, 5)), + ((_3X3, 4), (_3X3, 0), (_3X3, 3), (_3X3, 7)), + ((_3X3, 4), (_3X3, 2), (_3X3, 1), (_3X3, 3)), + ((_3X3, 4), (_3X3, 8), (_3X3, 5), (_3X3, 1)), + ((_3X3, 4), (_3X3, 6), (_3X3, 7), (_3X3, 5)), + + # L-shape + ((_3X3, 4), (_3X3, 2), (_3X3, 1), (_3X3, 7)), + ((_3X3, 4), (_3X3, 8), (_3X3, 5), (_3X3, 3)), + ((_3X3, 4), (_3X3, 6), (_3X3, 7), (_3X3, 1)), + ((_3X3, 4), (_3X3, 0), (_3X3, 3), (_3X3, 5)), + ((_3X3, 4), (_3X3, 0), (_3X3, 1), (_3X3, 7)), + ((_3X3, 4), (_3X3, 2), (_3X3, 5), (_3X3, 3)), + ((_3X3, 4), (_3X3, 8), (_3X3, 7), (_3X3, 1)), + ((_3X3, 4), (_3X3, 6), (_3X3, 3), (_3X3, 5)), + + # Square + ((_2X2, 0), (_2X2, 1), (_2X2, 3), (_2X2, 2)), + + # Hybrid scale + ((OVERLAP, 0), (_3X3, 2), (_3X3, 5), (_3X3, 8)), + ((OVERLAP, 1), (_3X3, 8), (_3X3, 7), (_3X3, 6)), + ((OVERLAP, 3), (_3X3, 6), (_3X3, 3), (_3X3, 0)), + ((OVERLAP, 2), (_3X3, 0), (_3X3, 1), (_3X3, 2)), + ((OVERLAP, 0), (_3X3, 6), (_3X3, 7), (_3X3, 8)), + ((OVERLAP, 1), (_3X3, 0), (_3X3, 3), (_3X3, 6)), + ((OVERLAP, 3), (_3X3, 2), (_3X3, 1), (_3X3, 0)), + ((OVERLAP, 2), (_3X3, 8), (_3X3, 5), (_3X3, 
2)), + ((OVERLAP, 0), (_3X3, 5), (_3X3, 8), (_3X3, 7)), + ((OVERLAP, 1), (_3X3, 7), (_3X3, 6), (_3X3, 3)), + ((OVERLAP, 3), (_3X3, 3), (_3X3, 0), (_3X3, 1)), + ((OVERLAP, 2), (_3X3, 1), (_3X3, 2), (_3X3, 5)), + +) + +# 5-patch configurations +patterns_5patch = ( + + # Cross + ((_3X3, 4), (_3X3, 1), (_3X3, 5), (_3X3, 7), (_3X3, 3)), + + # X-shape + ((_3X3, 4), (_3X3, 0), (_3X3, 2), (_3X3, 8), (_3X3, 6)), + + # T-shape + ((_3X3, 4), (_3X3, 0), (_3X3, 1), (_3X3, 2), (_3X3, 7)), + ((_3X3, 4), (_3X3, 2), (_3X3, 5), (_3X3, 8), (_3X3, 3)), + ((_3X3, 4), (_3X3, 8), (_3X3, 7), (_3X3, 6), (_3X3, 1)), + ((_3X3, 4), (_3X3, 6), (_3X3, 3), (_3X3, 0), (_3X3, 5)), + + # Z-shape + ((_3X3, 4), (_3X3, 0), (_3X3, 1), (_3X3, 7), (_3X3, 8)), + ((_3X3, 4), (_3X3, 2), (_3X3, 5), (_3X3, 3), (_3X3, 6)), + ((_3X3, 4), (_3X3, 8), (_3X3, 7), (_3X3, 1), (_3X3, 0)), + ((_3X3, 4), (_3X3, 6), (_3X3, 3), (_3X3, 5), (_3X3, 2)), + ((_3X3, 4), (_3X3, 0), (_3X3, 3), (_3X3, 5), (_3X3, 8)), + ((_3X3, 4), (_3X3, 2), (_3X3, 1), (_3X3, 7), (_3X3, 6)), + ((_3X3, 4), (_3X3, 8), (_3X3, 5), (_3X3, 3), (_3X3, 0)), + ((_3X3, 4), (_3X3, 6), (_3X3, 7), (_3X3, 1), (_3X3, 2)), + + # U-shape + ((_3X3, 4), (_3X3, 0), (_3X3, 3), (_3X3, 5), (_3X3, 2)), + ((_3X3, 4), (_3X3, 2), (_3X3, 1), (_3X3, 7), (_3X3, 8)), + ((_3X3, 4), (_3X3, 8), (_3X3, 5), (_3X3, 3), (_3X3, 6)), + ((_3X3, 4), (_3X3, 6), (_3X3, 7), (_3X3, 1), (_3X3, 0)), + + # V-shape + ((_3X3, 0), (_3X3, 4), (_3X3, 8), (_3X3, 7), (_3X3, 6)), + ((_3X3, 2), (_3X3, 4), (_3X3, 6), (_3X3, 3), (_3X3, 0)), + ((_3X3, 8), (_3X3, 4), (_3X3, 0), (_3X3, 1), (_3X3, 2)), + ((_3X3, 6), (_3X3, 4), (_3X3, 2), (_3X3, 5), (_3X3, 8)), + ((_3X3, 0), (_3X3, 4), (_3X3, 8), (_3X3, 5), (_3X3, 2)), + ((_3X3, 2), (_3X3, 4), (_3X3, 6), (_3X3, 7), (_3X3, 8)), + ((_3X3, 8), (_3X3, 4), (_3X3, 0), (_3X3, 3), (_3X3, 6)), + ((_3X3, 6), (_3X3, 4), (_3X3, 2), (_3X3, 1), (_3X3, 0)), + + # Hybrid scale + ((OVERLAP, 0), (_3X3, 2), (_3X3, 5), (_3X3, 6), (_3X3, 7)), + ((OVERLAP, 1), (_3X3, 8), (_3X3, 7), (_3X3, 0), (_3X3, 3)), + ((OVERLAP, 3), (_3X3, 6), (_3X3, 3), (_3X3, 2), (_3X3, 1)), + ((OVERLAP, 2), (_3X3, 0), (_3X3, 1), (_3X3, 8), (_3X3, 5)) + +) diff --git a/applications/selfsupervised/pretrain_siamese.py b/applications/selfsupervised/pretrain_siamese.py new file mode 100644 index 00000000000..a818f855126 --- /dev/null +++ b/applications/selfsupervised/pretrain_siamese.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +import functools +import operator +import os.path +import google.protobuf.text_format as txtf +import lbann +import modules +from util import str_list +import patch_generator + +def setup(num_patches=3, + mini_batch_size=512, + num_epochs=75, + learning_rate=0.005, + bn_statistics_group_size=2, + fc_data_layout='model_parallel', + warmup=True, + checkpoint_interval=None): + + # Data dimensions + patch_dims = patch_generator.patch_dims + num_labels = patch_generator.num_labels(num_patches) + + # Extract tensors from data sample + input = lbann.Input() + slice_points = [0] + for _ in range(num_patches): + patch_size = functools.reduce(operator.mul, patch_dims) + slice_points.append(slice_points[-1] + patch_size) + slice_points.append(slice_points[-1] + num_labels) + sample = lbann.Slice(input, slice_points=str_list(slice_points)) + patches = [lbann.Reshape(sample, dims=str_list(patch_dims)) + for _ in range(num_patches)] + labels = lbann.Identity(sample) + + # Siamese network + head_cnn = modules.ResNet(bn_statistics_group_size=bn_statistics_group_size) + heads = [head_cnn(patch) for patch in patches] + 
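# Note: head_cnn is a single module instance, so all Siamese heads share the same convolution and batch norm weights across patches. +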
heads_concat = lbann.Concatenation(heads) + + # Classification network + class_fc1 = modules.FcBnRelu(4096, + statistics_group_size=bn_statistics_group_size, + name='siamese_class_fc1', + data_layout=fc_data_layout) + class_fc2 = modules.FcBnRelu(4096, + statistics_group_size=bn_statistics_group_size, + name='siamese_class_fc2', + data_layout=fc_data_layout) + class_fc3 = lbann.modules.FullyConnectedModule(num_labels, + activation=lbann.Softmax, + name='siamese_class_fc3', + data_layout=fc_data_layout) + x = class_fc1(heads_concat) + x = class_fc2(x) + probs = class_fc3(x) + + # Setup objective function + cross_entropy = lbann.CrossEntropy([probs, labels]) + l2_reg_weights = set() + for l in lbann.traverse_layer_graph(input): + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=0.0002) + obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Setup model + metrics = [lbann.Metric(lbann.CategoricalAccuracy([probs, labels]), + name='accuracy', unit='%')] + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + if checkpoint_interval: + callbacks.append( + lbann.CallbackCheckpoint( + checkpoint_dir='ckpt', + checkpoint_epochs=5 + ) + ) + + # Learning rate schedules + if warmup: + callbacks.append( + lbann.CallbackLinearGrowthLearningRate( + target=learning_rate * mini_batch_size / 128, + num_epochs=5 + ) + ) + callbacks.append( + lbann.CallbackDropFixedLearningRate( + drop_epoch=list(range(0, 100, 15)), amt=0.25) + ) + + # Construct model + model = lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + + # Setup optimizer + opt = lbann.SGD(learn_rate=learning_rate, momentum=0.9) + # opt = lbann.Adam(learn_rate=learning_rate, beta1=0.9, beta2=0.999, eps=1e-8) + + # Setup data reader + data_reader = make_data_reader(num_patches) + + # Return experiment objects + return model, data_reader, opt + +def make_data_reader(num_patches): + message = lbann.reader_pb2.DataReader() + data_reader = message.reader.add() + data_reader.name = 'python' + data_reader.role = 'train' + data_reader.shuffle = True + data_reader.percent_of_data_to_use = 1.0 + data_reader.python.module = 'patch_generator' + data_reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + data_reader.python.num_samples_function = 'num_samples' + if num_patches == 2: + data_reader.python.sample_function = 'get_sample_2patch' + data_reader.python.sample_dims_function = 'sample_dims_2patch' + if num_patches == 3: + data_reader.python.sample_function = 'get_sample_3patch' + data_reader.python.sample_dims_function = 'sample_dims_3patch' + if num_patches == 4: + data_reader.python.sample_function = 'get_sample_4patch' + data_reader.python.sample_dims_function = 'sample_dims_4patch' + if num_patches == 5: + data_reader.python.sample_function = 'get_sample_5patch' + data_reader.python.sample_dims_function = 'sample_dims_5patch' + return message + +if __name__ == "__main__": + import argparse + import lbann.contrib.args + import lbann.contrib.launcher + + # Command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument( + '--job-name', action='store', default='lbann_siamese', type=str, + help='scheduler job name (default: lbann_siamese)', metavar='NAME') + parser.add_argument( + '--num-patches', action='store', default=3, type=int, + help='number of patches and Siamese heads (default: 3)', 
metavar='NUM') + lbann.contrib.args.add_scheduler_arguments(parser) + parser.add_argument( + '--mini-batch-size', action='store', default=512, type=int, + help='mini-batch size (default: 512)', metavar='NUM') + parser.add_argument( + '--num-epochs', action='store', default=75, type=int, + help='number of epochs (default: 75)', metavar='NUM') + parser.add_argument( + '--learning-rate', action='store', default=0.005, type=float, + help='learning rate (default: 0.005)', metavar='LR') + parser.add_argument( + '--bn-statistics-group-size', action='store', default=2, type=int, + help=('group size for batch norm statistics (default: 2)')) + parser.add_argument( + '--fc-data-layout', action='store', default='model_parallel', type=str, + help=('data layout for fully-connected layers ' + '(default: "model_parallel")')) + parser.add_argument( + '--warmup', action='store', default=True, type=bool, + help='use learning rate warmup (default: True)') + args = parser.parse_args() + + # Setup experiment + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) + model, data_reader, opt = setup( + num_patches=args.num_patches, + mini_batch_size=args.mini_batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + bn_statistics_group_size=args.bn_statistics_group_size, + fc_data_layout=args.fc_data_layout, + warmup=args.warmup, + ) + + # Run experiment + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + lbann.contrib.launcher.run( + trainer, model, data_reader, opt, + job_name = args.job_name, + **kwargs, + ) diff --git a/applications/selfsupervised/resnet.py b/applications/selfsupervised/resnet.py new file mode 100644 index 00000000000..2ab7911f53e --- /dev/null +++ b/applications/selfsupervised/resnet.py @@ -0,0 +1,498 @@ +"""Copy-pasted from lbann.models.resnet.""" +import lbann +import lbann.modules + +# ============================================== +# Helper modules +# ============================================== + +class ConvBNRelu(lbann.modules.Module): + """Convolution -> Batch normalization -> ReLU + + Basic unit for ResNets. Assumes image data in NCHW format. + + """ + + def __init__(self, out_channels, kernel_size, stride, padding, + bn_zero_init, bn_statistics_group_size, + relu, name): + """Initialize ConvBNRelu module. + + Args: + out_channels (int): Number of output channels, i.e. number + of convolution filters. + kernel_size (int): Size of convolution kernel. + stride (int): Convolution stride. + padding (int): Convolution padding. + bn_zero_init (bool): Zero-initialize batch normalization + scale. + bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. + relu (bool): Apply ReLU activation. + name (str): Module name. 
+ + """ + super().__init__() + self.name = name + self.instance = 0 + + # Initialize convolution + self.conv = lbann.modules.Convolution2dModule( + out_channels, kernel_size, + stride=stride, padding=padding, + bias=False, + name=self.name + '_conv') + + # Initialize batch normalization + bn_scale_init = 0.0 if bn_zero_init else 1.0 + bn_scale = lbann.Weights( + initializer=lbann.ConstantInitializer(value=bn_scale_init), + name=self.name + '_bn_scale') + bn_bias = lbann.Weights( + initializer=lbann.ConstantInitializer(value=0.0), + name=self.name + '_bn_bias') + self.bn_weights = [bn_scale, bn_bias] + self.bn_statistics_group_size = bn_statistics_group_size + + # Initialize ReLU + self.relu = relu + + def forward(self, x): + self.instance += 1 + conv = self.conv(x) + bn = lbann.BatchNormalization( + conv, weights=self.bn_weights, + statistics_group_size=(-1 if self.bn_statistics_group_size == 0 + else self.bn_statistics_group_size), + name='{0}_bn_instance{1}'.format(self.name,self.instance)) + if self.relu: + return lbann.Relu( + bn, name='{0}_relu_instance{1}'.format(self.name,self.instance)) + else: + return bn + +class BasicBlock(lbann.modules.Module): + """Residual block without bottlenecking. + + The number of output channels is the same as the number of + internal channels. Assumes image data in NCHW format. This is the + residual block used in ResNet-{18,34}. + + """ + + def __init__(self, in_channels, mid_channels, + downsample, zero_init_residual, + bn_statistics_group_size, name, width=1): + """Initialize residual block. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of channels in residual branch. + downsample (bool): Perform spatial downsampling (by a + factor of 2 in each spatial dimension). + zero_init_residual (bool): Zero-initialize the scale in + the final batch normalization in the residual branch. + bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. + name (str): Module name. + width (float, optional): Width growth factor for 3x3 + convolutions. + + """ + super().__init__() + self.name = name + self.instance = 0 + mid_channels = int(mid_channels * width) + self.out_channels = mid_channels + + # Skip connection + if downsample: + self.branch1 = ConvBNRelu(self.out_channels, 1, 2, 0, + False, bn_statistics_group_size, + False, self.name + '_branch1') + elif in_channels != self.out_channels: + self.branch1 = ConvBNRelu(self.out_channels, 1, 1, 0, + False, bn_statistics_group_size, + False, self.name + '_branch1') + else: + self.branch1 = None + + # Residual branch + self.branch2a = ConvBNRelu(mid_channels, 3, + (2 if downsample else 1), 1, + False, bn_statistics_group_size, + True, self.name + '_branch2a') + self.branch2b = ConvBNRelu(self.out_channels, 3, 1, 1, + zero_init_residual, + bn_statistics_group_size, + False, self.name + '_branch2b') + + def forward(self, x): + self.instance += 1 + y1 = self.branch1(x) if self.branch1 else x + y2 = self.branch2b(self.branch2a(x)) + z = lbann.Add([y1, y2], + name='{0}_sum_instance{1}'.format(self.name,self.instance)) + return lbann.Relu(z, name='{0}_relu_instance{1}'.format(self.name,self.instance)) + +class BottleneckBlock(lbann.modules.Module): + """Residual block with bottlenecking. + + The number of output channels is four times the number of internal + channels. Assumes image data in NCHW format. This is the residual + block used in ResNet-{50,101,152}. 
+ + """ + + def __init__(self, in_channels, mid_channels, + downsample, zero_init_residual, + bn_statistics_group_size, name, width=1): + """Initialize residual block. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of channels in residual branch. + downsample (bool): Perform spatial downsampling (by a + factor of 2 in each spatial dimension). + zero_init_residual (bool): Zero-initialize the scale in + the final batch normalization in the residual branch. + bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. + name (str): Module name. + width (float, optional): Width growth factor for 3x3 + convolutions. + + """ + super().__init__() + self.name = name + self.instance = 0 + self.out_channels = 4 * mid_channels + # Width factor does not grow the output channel size. + mid_channels = int(mid_channels * width) + + # Skip connection + if downsample: + self.branch1 = ConvBNRelu(self.out_channels, 1, 2, 0, + False, bn_statistics_group_size, + False, self.name + '_branch1') + elif in_channels != self.out_channels: + self.branch1 = ConvBNRelu(self.out_channels, 1, 1, 0, + False, bn_statistics_group_size, + False, self.name + '_branch1') + else: + self.branch1 = None + + # Residual branch + self.branch2a = ConvBNRelu(mid_channels, 1, 1, 0, + False, bn_statistics_group_size, + True, self.name + '_branch2a') + self.branch2b = ConvBNRelu(mid_channels, 3, + (2 if downsample else 1), 1, + False, bn_statistics_group_size, + True, self.name + '_branch2b') + self.branch2c = ConvBNRelu(self.out_channels, 1, 1, 0, + zero_init_residual, + bn_statistics_group_size, + False, self.name + '_branch2c') + + def forward(self, x): + self.instance += 1 + y1 = self.branch1(x) if self.branch1 else x + y2 = self.branch2c(self.branch2b(self.branch2a(x))) + z = lbann.Add([y1, y2], + name='{0}_sum_instance{1}'.format(self.name,self.instance)) + return lbann.Relu(z, name='{0}_relu_instance{1}'.format(self.name,self.instance)) + +# ============================================== +# ResNet modules +# ============================================== + +class ResNet(lbann.modules.Module): + """Residual neural network. + + A ResNet is comprised of residual blocks, which are small + convolutional networks with a skip connection. These blocks are + grouped into "layers" (this is a horribly overloaded term, but we + are following the common usage). At the first block in each layer + (except the first), the spatial dimensions are all downsampled by + a factor of 2. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + + def __init__(self, block, + layer_sizes, layer_channels, + zero_init_residual, bn_statistics_group_size, + name, width=1): + """Initialize ResNet. + + Args: + block (type): Residual block type, which should be a + `lbann.modules.Module`. + layer_sizes (`Iterable` containing `int`s): Number of + blocks in each ResNet layer. + layer_channels (`Iterable` containing `int`s): Number of + internal channels in each ResNet layer. + zero_init_residual (bool): Whether to initialize the final + batch normalization in residual branches with zeros. 
+ bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. + name (str): Module name. + width (float, optional): Width growth factor. + + """ + super().__init__() + self.name = name + self.instance = 0 + self.conv1 = ConvBNRelu(layer_channels[0], 7, 2, 3, + False, bn_statistics_group_size, + True, self.name + '_conv1') + self.blocks = [] + for layer in range(len(layer_sizes)): + mid_channels = layer_channels[layer] + for i in range(layer_sizes[layer]): + in_channels = (self.blocks[-1].out_channels + if self.blocks + else mid_channels) + downsample = (i == 0 and layer > 0) + b = block(in_channels, mid_channels, + downsample, zero_init_residual, + bn_statistics_group_size, + '{0}_layer{1}_block{2}'.format(self.name, layer, i), + width=width) + self.blocks.append(b) + + def forward(self, x): + self.instance += 1 + x = self.conv1(x) + x = lbann.Pooling(x, num_dims=2, has_vectors=False, + pool_dims_i=3, pool_pads_i=1, pool_strides_i=2, + pool_mode='max', + name='{0}_pool1_instance{1}'.format(self.name,self.instance)) + for b in self.blocks: + x = b(x) + return x + +class ResNet18(ResNet): + """ResNet-18 neural network. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, + zero_init_residual=True, + bn_statistics_group_size=1, + name=None, + width=1): + """Initialize ResNet-18. + + Args: + zero_init_residual (bool, optional): Whether to initialize + the final batch normalization in residual branches + with zeros. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. + name (str, optional): Module name + (default: 'resnet18_module') + width (float, optional): Width growth factor. + + """ + ResNet18.global_count += 1 + if name is None: + name = 'resnet18_module{0}'.format(ResNet18.global_count) + super().__init__(BasicBlock, + (2,2,2,2), (64,128,256,512), + zero_init_residual, bn_statistics_group_size, + name, width=width) + +class ResNet34(ResNet): + """ResNet-34 neural network. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, + zero_init_residual=True, + bn_statistics_group_size=1, + name=None, + width=1): + """Initialize ResNet-34. + + Args: + zero_init_residual (bool, optional): Whether to initialize + the final batch normalization in residual branches + with zeros. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. + name (str, optional): Module name + (default: 'resnet34_module') + width (float, optional): Width growth factor. 
+ + """ + ResNet34.global_count += 1 + if name is None: + name = 'resnet34_module{0}'.format(ResNet34.global_count) + super().__init__(BasicBlock, + (3,4,6,3), (64,128,256,512), + zero_init_residual, bn_statistics_group_size, + name, width=width) + +class ResNet50(ResNet): + """ResNet-50 neural network. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, + zero_init_residual=True, + bn_statistics_group_size=1, + name=None, + width=1): + """Initialize ResNet-50. + + Args: + zero_init_residual (bool, optional): Whether to initialize + the final batch normalization in residual branches + with zeros. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. + name (str, optional): Module name + (default: 'resnet50_module') + width (float, optional): Width growth factor. + + """ + ResNet50.global_count += 1 + if name is None: + name = 'resnet50_module{0}'.format(ResNet50.global_count) + super().__init__(BottleneckBlock, + (3,4,6,3), (64,128,256,512), + zero_init_residual, bn_statistics_group_size, + name, width=width) + +class ResNet101(ResNet): + """ResNet-101 neural network. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, + zero_init_residual=True, + bn_statistics_group_size=1, + name=None, width=1): + """Initialize ResNet-101. + + Args: + zero_init_residual (bool, optional): Whether to initialize + the final batch normalization in residual branches + with zeros. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. + name (str, optional): Module name + (default: 'resnet101_module') + width (float, optional): Width growth factor. + + """ + ResNet101.global_count += 1 + if name is None: + name = 'resnet101_module{0}'.format(ResNet101.global_count) + super().__init__(BottleneckBlock, + (3,4,23,3), (64,128,256,512), + zero_init_residual, bn_statistics_group_size, + name, width=width) + +class ResNet152(ResNet): + """ResNet-152 neural network. + + This does not include the fully-connected network that is commonly + applied following the convolutional network. Input data is assumed + to be image data in NCHW format. + + See: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. "Deep + residual learning for image recognition." In Proceedings of + the IEEE Conference on Computer Vision and Pattern + Recognition, pp. 770-778. 2016. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, + zero_init_residual=True, + bn_statistics_group_size=1, + name=None, + width=1): + """Initialize ResNet-152. 
+ + Args: + zero_init_residual (bool, optional): Whether to initialize + the final batch normalization in residual branches + with zeros. + bn_statistics_group_size (int, optional): Group size for + aggregating batch normalization statistics. + name (str, optional): Module name + (default: 'resnet152_module') + width (float, optional): Width growth factor. + + """ + ResNet152.global_count += 1 + if name is None: + name = 'resnet152_module{0}'.format(ResNet152.global_count) + super().__init__(BottleneckBlock, + (3,8,36,3), (64,128,256,512), + zero_init_residual, bn_statistics_group_size, + name, width=width) diff --git a/applications/selfsupervised/util.py b/applications/selfsupervised/util.py new file mode 100644 index 00000000000..7b4f16dcd4e --- /dev/null +++ b/applications/selfsupervised/util.py @@ -0,0 +1,27 @@ +import os +import os.path +import datetime + +def str_list(l): + """Convert iterable to a space-separated string.""" + return ' '.join([str(i) for i in l]) + +def make_experiment_dir(job_name=None): + if job_name is None: + job_name = 'lbann_siamese' + if 'LBANN_EXPERIMENT_DIR' in os.environ: + experiment_dir = os.environ['LBANN_EXPERIMENT_DIR'] + else: + experiment_dir = os.getcwd() + timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + experiment_dir = os.path.join(experiment_dir, + '{}_{}'.format(timestamp, job_name)) + i = 1 + while os.path.lexists(experiment_dir): + i += 1 + experiment_dir = os.path.join( + os.path.dirname(experiment_dir), + '{}_{}_{}'.format(timestamp, job_name, i)) + experiment_dir = os.path.abspath(experiment_dir) + os.makedirs(experiment_dir, exist_ok=True) + return experiment_dir diff --git a/applications/vision/README.md b/applications/vision/README.md new file mode 100644 index 00000000000..fc05737bf67 --- /dev/null +++ b/applications/vision/README.md @@ -0,0 +1,21 @@ +# Example models for computer vision + +This directory contains LBANN implementations of widely-used vision +models. They are intended to validate and benchmark LBANN's vision +functionality, and are also suitable as pedagogical tools for using +LBANN. + +## LeNet + +`lenet.py` trains a LeNet model on MNIST data. It is a simple script +intended to demonstrate LBANN's Python API. It calls helper functions +in `data/mnist/__init__.py` to download MNIST data and construct MNIST +data readers. + +## ImageNet models + +`alexnet.py`, `resnet.py`, and `densenet.py` are primarily used for +performance benchmarks and scaling studies. They use LLNL-specific +features, and the helper functions in `data/imagenet/__init__.py` +assume that the user is on an LLNL LC system and belongs to the +`brainusr` group. diff --git a/applications/vision/alexnet.py b/applications/vision/alexnet.py new file mode 100644 index 00000000000..c836e90adbd --- /dev/null +++ b/applications/vision/alexnet.py @@ -0,0 +1,81 @@ +import argparse +import lbann +import lbann.models +import lbann.contrib.args +import lbann.contrib.launcher +import data.imagenet + +# Command-line arguments +desc = ('Construct and run AlexNet on ImageNet-1K data.
' + 'Running the experiment is only supported on LC systems.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_alexnet', type=str, + help='scheduler job name (default: lbann_alexnet)') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=100, type=int, + help='number of epochs (default: 100)', metavar='NUM') +parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') +lbann.contrib.args.add_optimizer_arguments(parser) +parser.add_argument( + '--setup_only', action='store_true', + help='setup LBANN experiment without running it') +args = parser.parse_args() + +# Due to a data reader limitation, the actual model realization must be +# hardcoded to 1000 labels for ImageNet. +imagenet_labels = 1000 + +# Construct layer graph +input_ = lbann.Input() +images = lbann.Identity(input_) +labels = lbann.Identity(input_) +preds = lbann.models.AlexNet(imagenet_labels)(images) +probs = lbann.Softmax(preds) +cross_entropy = lbann.CrossEntropy(probs, labels) +top1 = lbann.CategoricalAccuracy(probs, labels) +top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) +layers = list(lbann.traverse_layer_graph(input_)) + +# Setup objective function +weights = set() +for l in layers: + weights.update(l.weights) +l2_reg = lbann.L2WeightRegularization(weights=weights, scale=5e-4) +obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + +# Setup model +metrics = [lbann.Metric(top1, name='top-1 accuracy', unit='%'), + lbann.Metric(top5, name='top-5 accuracy', unit='%')] +callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDropFixedLearningRate( + drop_epoch=[20,40,60], amt=0.1)] +model = lbann.Model(args.num_epochs, + layers=layers, + weights=weights, + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +# Setup optimizer +opt = lbann.contrib.args.create_optimizer(args) + +# Setup data reader +data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes) + +# Setup trainer +trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) + +# Run experiment +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + setup_only=args.setup_only, + **kwargs) diff --git a/applications/vision/data/__init__.py b/applications/vision/data/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/applications/vision/data/cifar10/__init__.py b/applications/vision/data/cifar10/__init__.py new file mode 100644 index 00000000000..9fa71d684ca --- /dev/null +++ b/applications/vision/data/cifar10/__init__.py @@ -0,0 +1,28 @@ +import os +import os.path + +import google.protobuf.text_format +import lbann +import lbann.contrib.lc.paths + +def make_data_reader(num_classes=10): + + # Load Protobuf message from file + current_dir = os.path.dirname(os.path.realpath(__file__)) + protobuf_file = os.path.join(current_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Check if data paths are accessible + data_dir = lbann.contrib.lc.paths.cifar10_dir() + + if not 
os.path.isdir(data_dir): + raise FileNotFoundError('could not access {}'.format(data_dir)) + + # Set paths + message.reader[0].data_filedir = data_dir + message.reader[1].data_filedir = data_dir + + return message diff --git a/applications/vision/data/cifar10/data_reader.prototext b/applications/vision/data/cifar10/data_reader.prototext new file mode 100644 index 00000000000..2867c622231 --- /dev/null +++ b/applications/vision/data/cifar10/data_reader.prototext @@ -0,0 +1,43 @@ +data_reader { + reader { + name: "cifar10" + role: "train" + shuffle: true + data_filedir: "path/to/cifar10/data" + validation_percent: 0.1 + absolute_sample_count: 0 + percent_of_data_to_use: 1.0 + + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + normalize_to_lbann_layout { + means: "0.44653 0.48216 0.4914" + stddevs: "0.26159 0.24349 0.24703" + } + } + } + reader { + name: "cifar10" + role: "test" + shuffle: true + data_filedir: "path/to/cifar10/data" + absolute_sample_count: 0 + percent_of_data_to_use: 1.0 + + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + normalize_to_lbann_layout { + means: "0.44653 0.48216 0.4914" + stddevs: "0.26159 0.24349 0.24703" + } + } + } +} diff --git a/applications/vision/data/imagenet/__init__.py b/applications/vision/data/imagenet/__init__.py new file mode 100644 index 00000000000..dc3d46552c4 --- /dev/null +++ b/applications/vision/data/imagenet/__init__.py @@ -0,0 +1,56 @@ +import os +import os.path + +import google.protobuf.text_format +import lbann +import lbann.contrib.launcher + +def make_data_reader(num_classes=1000): + + # Load Protobuf message from file + current_dir = os.path.dirname(os.path.realpath(__file__)) + protobuf_file = os.path.join(current_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Paths to ImageNet data + # Note: Paths are only known for some compute centers + compute_center = lbann.contrib.launcher.compute_center() + if compute_center == 'lc': + from lbann.contrib.lc.paths import imagenet_dir, imagenet_labels + train_data_dir = imagenet_dir(data_set='train', + num_classes=num_classes) + train_label_file = imagenet_labels(data_set='train', + num_classes=num_classes) + test_data_dir = imagenet_dir(data_set='val', + num_classes=num_classes) + test_label_file = imagenet_labels(data_set='val', + num_classes=num_classes) + elif compute_center == 'nersc': + from lbann.contrib.nersc.paths import imagenet_dir, imagenet_labels + train_data_dir = imagenet_dir(data_set='train') + train_label_file = imagenet_labels(data_set='train') + test_data_dir = imagenet_dir(data_set='val') + test_label_file = imagenet_labels(data_set='val') + else: + raise RuntimeError(f'ImageNet data paths are unknown for current compute center ({compute_center})') + + # Check that data paths are accessible + if not os.path.isdir(train_data_dir): + raise FileNotFoundError('could not access {}'.format(train_data_dir)) + if not os.path.isfile(train_label_file): + raise FileNotFoundError('could not access {}'.format(train_label_file)) + if not os.path.isdir(test_data_dir): + raise FileNotFoundError('could not access {}'.format(test_data_dir)) + if not os.path.isfile(test_label_file): + raise FileNotFoundError('could not access {}'.format(test_label_file)) + + # Set paths + message.reader[0].data_filedir = train_data_dir + message.reader[0].data_filename = train_label_file + message.reader[1].data_filedir = 
test_data_dir + message.reader[1].data_filename = test_label_file + + return message diff --git a/applications/vision/data/imagenet/data_reader.prototext b/applications/vision/data/imagenet/data_reader.prototext new file mode 100644 index 00000000000..3810e28046c --- /dev/null +++ b/applications/vision/data/imagenet/data_reader.prototext @@ -0,0 +1,61 @@ +data_reader { + reader { + name: "imagenet" + role: "train" + shuffle: true + data_filedir: "path/to/ILSVRC2012/train" + data_filename: "path/to/ILSVRC2012/labels/train.txt" + validation_percent: 0.0 + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + random_resized_crop { + height: 224 + width: 224 + } + } + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } + + reader { + name: "imagenet" + role: "validate" + shuffle: true + data_filedir: "path/to/ILSVRC2012/val" + data_filename: "path/to/ILSVRC2012/labels/val.txt" + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + resized_center_crop { + height: 256 + width: 256 + crop_height: 224 + crop_width: 224 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } +} diff --git a/applications/vision/data/mnist/.gitignore b/applications/vision/data/mnist/.gitignore new file mode 100644 index 00000000000..10c191aa77f --- /dev/null +++ b/applications/vision/data/mnist/.gitignore @@ -0,0 +1,5 @@ +*.gz +train-images-idx3-ubyte +train-labels-idx1-ubyte +t10k-images-idx3-ubyte +t10k-labels-idx1-ubyte diff --git a/applications/vision/data/mnist/__init__.py b/applications/vision/data/mnist/__init__.py new file mode 100644 index 00000000000..271ccf0f61e --- /dev/null +++ b/applications/vision/data/mnist/__init__.py @@ -0,0 +1,59 @@ +import gzip +import os +import os.path +import urllib.request + +import google.protobuf.text_format +import lbann + +# Paths +data_dir = os.path.dirname(os.path.realpath(__file__)) + +def download_data(): + """Download MNIST data files, if needed. + + Data files are downloaded from http://yann.lecun.com/exdb/mnist/ + and uncompressed. Does nothing if the files already exist. + + """ + + # MNIST data files and associated URLs + urls = { + 'train-images-idx3-ubyte': 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', + 'train-labels-idx1-ubyte': 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', + 't10k-images-idx3-ubyte': 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', + 't10k-labels-idx1-ubyte': 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', + } + + # Download and uncompress MNIST data files, if needed + for data_file, url in urls.items(): + data_file = os.path.join(data_dir, data_file) + compressed_file = data_file + '.gz' + if not os.path.isfile(data_file): + urllib.request.urlretrieve(url, filename=compressed_file) + with gzip.open(compressed_file, 'rb') as in_file: + with open(data_file, 'wb') as out_file: + out_file.write(in_file.read()) + +def make_data_reader(): + """Make Protobuf message for MNIST data reader. + + MNIST data is downloaded if needed. 
+ + """ + + # Download MNIST data files + download_data() + + # Load Protobuf message from file + protobuf_file = os.path.join(data_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set paths + for reader in message.reader: + reader.data_filedir = data_dir + + return message diff --git a/applications/vision/data/mnist/data_reader.prototext b/applications/vision/data/mnist/data_reader.prototext new file mode 100644 index 00000000000..61c3b32cf42 --- /dev/null +++ b/applications/vision/data/mnist/data_reader.prototext @@ -0,0 +1,30 @@ +data_reader { + reader { + name: "mnist" + role: "train" + shuffle: true + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "train-images-idx3-ubyte" + label_filename: "train-labels-idx1-ubyte" + validation_percent: 0.1 + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } + reader { + name: "mnist" + role: "test" + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "t10k-images-idx3-ubyte" + label_filename: "t10k-labels-idx1-ubyte" + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } +} diff --git a/applications/vision/densenet.py b/applications/vision/densenet.py new file mode 100644 index 00000000000..7e70c4d23a0 --- /dev/null +++ b/applications/vision/densenet.py @@ -0,0 +1,470 @@ +import argparse +import lbann +import lbann.contrib.args +import lbann.contrib.launcher +import data.imagenet + +LOG = True + + +def log(string): + if LOG: + print(string) + + +# DenseNet ##################################################################### +# See src/proto/lbann.proto for possible functions to call. +# See PyTorch DenseNet: +# https://github.com/pytorch/vision/blob/master/torchvision/models/densenet.py +# See "Densely Connected Convolutional Networks" by Huang et. al p.4 +def densenet(statistics_group_size, + version, + cumulative_layer_num, + images_node + ): + if version == 121: + growth_rate = 32 # k in the paper + layers_per_block = (6, 12, 24, 16) + num_initial_features = 64 + elif version == 161: + growth_rate = 48 # k in the paper + layers_per_block = (96, 48, 36, 24) + num_initial_features = 96 + else: + raise Exception('Invalid version={v}.'.format(v=version)) + batch_norm_size = 4 + + parent_node, cumulative_layer_num = initial_layer( + statistics_group_size, + cumulative_layer_num, images_node, + num_initial_features) + num_features = num_initial_features + # Start counting dense blocks at 1. + for current_block_num, num_layers in enumerate(layers_per_block, 1): + parent_nodes, cumulative_layer_num = dense_block( + statistics_group_size, + cumulative_layer_num, + parent_node, + batch_norm_size=batch_norm_size, + current_block_num=current_block_num, + growth_rate=growth_rate, + num_layers=num_layers, + num_initial_channels=num_initial_features + ) + # num_features += num_layers * growth_rate + for node in parent_nodes[1:]: + num_features += node.num_output_channels + parent_node = lbann.Concatenation(parent_nodes) + cumulative_layer_num += 1 + log('densenet Concatenation. 
cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + if current_block_num != len(layers_per_block): + parent_node, cumulative_layer_num = transition_layer( + statistics_group_size, + current_block_num, + cumulative_layer_num, + parent_node, + # In Python 3, this is integer division. + num_output_channels=num_features//2, + ) + num_features //= 2 + + batch_normalization_node = standard_batchnorm(statistics_group_size, + parent_node) + cumulative_layer_num += 1 + log('densenet BatchNormalization. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + relu_node = lbann.Relu(batch_normalization_node) + cumulative_layer_num += 1 + log('densenet Relu. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + probs = classification_layer( + cumulative_layer_num, + relu_node + ) + return probs + + +def initial_layer(statistics_group_size, + cumulative_layer_num, + images_node, + num_initial_channels + ): + # 7x7 conv, stride 2 + convolution_node = lbann.Convolution( + images_node, + conv_dims_i=7, + conv_pads_i=3, + conv_strides_i=2, + has_bias=False, + num_dims=2, + num_output_channels=num_initial_channels + ) + cumulative_layer_num += 1 + log('initial_layer Convolution. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + batch_normalization_node = standard_batchnorm(statistics_group_size, + convolution_node) + cumulative_layer_num += 1 + log('initial_layer BatchNormalization. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + relu_node = lbann.Relu(batch_normalization_node) + cumulative_layer_num += 1 + log('initial_layer Relu. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + # 3x3 max pool, stride 2 + pooling_node = lbann.Pooling( + relu_node, + num_dims=2, + pool_dims_i=3, + pool_mode='max', + pool_pads_i=1, + pool_strides_i=2 + ) + cumulative_layer_num += 1 + log('initial_layer Pooling. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + return pooling_node, cumulative_layer_num + + +def standard_batchnorm(statistics_group_size, parent_node): + return lbann.BatchNormalization( + parent_node, + bias_init=0.0, + decay=0.9, + epsilon=1e-5, + scale_init=1.0, + statistics_group_size=statistics_group_size + ) + + +def dense_block(statistics_group_size, + cumulative_layer_num, + parent_node, + batch_norm_size, + current_block_num, + growth_rate, + num_layers, + num_initial_channels + ): + parent_nodes = [parent_node] + # Start counting dense layers at 1. + for current_layer_num in range(1, num_layers + 1): + # channels from before block + (each dense layer has k=growth_rate channels) + num_input_channels = num_initial_channels + (current_layer_num - 1) * growth_rate + print('num_input_channels={c}'.format(c=num_input_channels)) + parent_node, cumulative_layer_num = dense_layer( + statistics_group_size, + current_block_num, + current_layer_num, + cumulative_layer_num, + parent_nodes, + batch_norm_size=batch_norm_size, + growth_rate=growth_rate + ) + parent_nodes.append(parent_node) + return parent_nodes, cumulative_layer_num + + +def dense_layer(statistics_group_size, + current_block_num, + current_layer_num, + cumulative_layer_num, + parent_nodes, + batch_norm_size, + growth_rate + ): + concatenation_node = lbann.Concatenation(parent_nodes) + cumulative_layer_num += 1 + log('dense_block={b} dense_layer={l} Concatenation. 
cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + conv_block_1_node, cumulative_layer_num = conv_block( + statistics_group_size, + current_block_num, + current_layer_num, + cumulative_layer_num, + concatenation_node, + conv_dims_i=1, + conv_pads_i=0, + num_output_channels=batch_norm_size * growth_rate + ) + conv_block_2_node, cumulative_layer_num = conv_block( + statistics_group_size, + current_block_num, + current_layer_num, + cumulative_layer_num, + conv_block_1_node, + conv_dims_i=3, + conv_pads_i=1, + num_output_channels=growth_rate + ) + return conv_block_2_node, cumulative_layer_num + + +def conv_block(statistics_group_size, + current_block_num, + current_layer_num, + cumulative_layer_num, + parent_node, + conv_dims_i, + conv_pads_i, + num_output_channels + ): + batch_normalization_node = standard_batchnorm(statistics_group_size, + parent_node) + cumulative_layer_num += 1 + log('dense_block={b} dense_layer={l} BatchNormalization. cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + + relu_node = lbann.Relu(batch_normalization_node) + cumulative_layer_num += 1 + log( + 'dense_block={b} dense_layer={l} Relu. cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + + convolution_node = lbann.Convolution( + relu_node, + conv_dims_i=conv_dims_i, + conv_pads_i=conv_pads_i, + conv_strides_i=1, + has_bias=False, + num_dims=2, + num_output_channels=num_output_channels + ) + cumulative_layer_num += 1 + log('dense_block={b} dense_layer={l} Convolution. cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + + return convolution_node, cumulative_layer_num + + +def transition_layer(statistics_group_size, + current_block_num, + cumulative_layer_num, + parent_node, + num_output_channels + ): + batch_normalization_node = standard_batchnorm(statistics_group_size, + parent_node) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer BatchNormalization. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + relu_node = lbann.Relu(batch_normalization_node) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer Relu. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + convolution_node = lbann.Convolution( + relu_node, + conv_dims_i=1, + conv_pads_i=0, + conv_strides_i=1, + has_bias=False, + num_dims=2, + num_output_channels=num_output_channels + ) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer Convolution. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + # 2x2 average pool, stride 2 + pooling_node = lbann.Pooling( + convolution_node, + num_dims=2, + pool_dims_i=2, + pool_mode='average', + pool_pads_i=0, + pool_strides_i=2 + ) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer Pooling. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + return pooling_node, cumulative_layer_num + + +def classification_layer(cumulative_layer_num, + parent_node): + # 7x7 global average pool + pooling_node = lbann.Pooling( + parent_node, + num_dims=2, + pool_dims_i=7, + pool_mode='average', + pool_pads_i=1, + pool_strides_i=1 + ) + cumulative_layer_num += 1 + log('classification_layer Pooling. 
cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + fully_connected_node = lbann.FullyConnected( + pooling_node, + num_neurons=1000, + has_bias=False + ) + cumulative_layer_num += 1 + log('classification_layer FullyConnected. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + probabilities = lbann.Softmax(fully_connected_node) + return probabilities + + +# Helpful Functions ############################################################ +def get_args(): + desc = ('Construct and run DenseNet on ImageNet data. ' + 'Running the experiment is only supported on LC systems.') + parser = argparse.ArgumentParser(description=desc) + lbann.contrib.args.add_scheduler_arguments(parser) + parser.add_argument( + '--job-name', action='store', default='lbann_densenet', type=str, + help='scheduler job name (default: lbann_densenet)') + parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') + parser.add_argument( + '--num-epochs', action='store', default=90, type=int, + help='number of epochs (default: 90)', metavar='NUM') + parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') + lbann.contrib.args.add_optimizer_arguments( + parser, + default_optimizer='sgd', + default_learning_rate=0.1 + ) + parser.add_argument( + '--setup_only', action='store_true', + help='do not run experiment (e.g. if only the prototext is desired)') + args = parser.parse_args() + return args + + +def construct_layer_graph( + statistics_group_size, + version, + cumulative_layer_num, + input_node): + # Input data + images_node = lbann.Identity(input_node) + cumulative_layer_num += 1 + log('Identity. cumulative_layer_num={n}'.format(n=cumulative_layer_num)) + + # Use input_node, not images_node. + image_labels_node = lbann.Identity(input_node) + cumulative_layer_num += 1 + log('Identity. cumulative_layer_num={n}'.format(n=cumulative_layer_num)) + + # Use images_node, not image_labels_node. 
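+ # densenet() returns the Softmax probabilities node; the labels node is + # returned separately so the caller can build the objective function and + # metrics. For version 121 the four dense blocks have 6/12/24/16 layers + # with growth rate 32, giving 256/512/1024/1024 feature maps at the block + # outputs (each transition layer then halves the channel count).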
+ probabilities = densenet(statistics_group_size, version, + cumulative_layer_num, images_node) + + return probabilities, image_labels_node + + +def set_up_experiment(args, + input_, + probs, + labels): + # Set up objective function + cross_entropy = lbann.CrossEntropy([probs, labels]) + layers = list(lbann.traverse_layer_graph(input_)) + l2_reg_weights = set() + for l in layers: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + # scale = weight decay + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) + objective_function = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Set up model + top1 = lbann.CategoricalAccuracy([probs, labels]) + top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5) + metrics = [lbann.Metric(top1, name='top-1 accuracy', unit='%'), + lbann.Metric(top5, name='top-5 accuracy', unit='%')] + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDropFixedLearningRate( + drop_epoch=[30, 60], amt=0.1)] + model = lbann.Model(args.num_epochs, + layers=layers, + objective_function=objective_function, + metrics=metrics, + callbacks=callbacks) + + # Set up data reader + data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes) + + # Set up optimizer + if args.optimizer == 'sgd': + print('Creating sgd optimizer') + optimizer = lbann.optimizer.SGD( + learn_rate=args.optimizer_learning_rate, + momentum=0.9, + nesterov=True + ) + else: + optimizer = lbann.contrib.args.create_optimizer(args) + + # Setup trainer + trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size) + + return trainer, model, data_reader, optimizer + + +def run_experiment(args, + trainer, + model, + data_reader, + optimizer): + # Note: Use `lbann.run` instead for non-LC systems. + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + lbann.contrib.launcher.run(trainer, model, data_reader, optimizer, + job_name=args.job_name, + **kwargs) + + +# Main function ################################################################ +def main(): + # ---------------------------------- + # Command-line arguments + # ---------------------------------- + + args = get_args() + + # ---------------------------------- + # Construct layer graph + # ---------------------------------- + + input_node = lbann.Input() + # Start counting cumulative layers at 1. + cumulative_layer_num = 1 + log('Input. 
cumulative_layer_num={n}'.format(n=cumulative_layer_num)) + (probs, labels) = construct_layer_graph( + args.procs_per_node, + 121, cumulative_layer_num, input_node) + + # ---------------------------------- + # Setup experiment + # ---------------------------------- + + (trainer, model, data_reader_proto, optimizer) = set_up_experiment( + args, input_node, probs, labels) + + # ---------------------------------- + # Run experiment + # ---------------------------------- + + run_experiment(args, trainer, model, data_reader_proto, optimizer) + + +if __name__ == '__main__': + main() diff --git a/applications/vision/lenet.py b/applications/vision/lenet.py new file mode 100644 index 00000000000..47e6819edfe --- /dev/null +++ b/applications/vision/lenet.py @@ -0,0 +1,98 @@ +import argparse +import lbann +import data.mnist +import lbann.contrib.args +import lbann.contrib.launcher + +# ---------------------------------- +# Command-line arguments +# ---------------------------------- + +desc = ('Train LeNet on MNIST data using LBANN.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_lenet', type=str, + help='scheduler job name (default: lbann_lenet)') +args = parser.parse_args() + +# ---------------------------------- +# Construct layer graph +# ---------------------------------- + +# Input data +input_ = lbann.Input() +images = lbann.Identity(input_) +labels = lbann.Identity(input_) + +# LeNet +x = lbann.Convolution(images, + num_dims = 2, + num_output_channels = 6, + num_groups = 1, + conv_dims_i = 5, + conv_strides_i = 1, + conv_dilations_i = 1, + has_bias = True) +x = lbann.Relu(x) +x = lbann.Pooling(x, + num_dims = 2, + pool_dims_i = 2, + pool_strides_i = 2, + pool_mode = "max") +x = lbann.Convolution(x, + num_dims = 2, + num_output_channels = 16, + num_groups = 1, + conv_dims_i = 5, + conv_strides_i = 1, + conv_dilations_i = 1, + has_bias = True) +x = lbann.Relu(x) +x = lbann.Pooling(x, + num_dims = 2, + pool_dims_i = 2, + pool_strides_i = 2, + pool_mode = "max") +x = lbann.FullyConnected(x, num_neurons = 120, has_bias = True) +x = lbann.Relu(x) +x = lbann.FullyConnected(x, num_neurons = 84, has_bias = True) +x = lbann.Relu(x) +x = lbann.FullyConnected(x, num_neurons = 10, has_bias = True) +probs = lbann.Softmax(x) + +# Loss function and accuracy +loss = lbann.CrossEntropy(probs, labels) +acc = lbann.CategoricalAccuracy(probs, labels) + +# ---------------------------------- +# Setup experiment +# ---------------------------------- + +# Setup model +mini_batch_size = 64 +num_epochs = 20 +model = lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=[lbann.Metric(acc, name='accuracy', unit='%')], + callbacks=[lbann.CallbackPrintModelDescription(), + lbann.CallbackPrint(), + lbann.CallbackTimer()]) + +# Setup optimizer +opt = lbann.SGD(learn_rate=0.01, momentum=0.9) + +# Setup data reader +data_reader = data.mnist.make_data_reader() + +# Setup trainer +trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + +# ---------------------------------- +# Run experiment +# ---------------------------------- +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/applications/vision/resnet.py b/applications/vision/resnet.py new file mode 100644 index 00000000000..3181d29f936 --- /dev/null +++ 
b/applications/vision/resnet.py @@ -0,0 +1,158 @@ +import argparse +import lbann +import lbann.models +import lbann.models.resnet +import lbann.contrib.args +import lbann.contrib.models.wide_resnet +import lbann.contrib.launcher +import data.imagenet + +# Command-line arguments +desc = ('Construct and run ResNet on ImageNet-1K data. ' + 'Running the experiment is only supported on LC systems.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_resnet', type=str, + help='scheduler job name (default: lbann_resnet)') +parser.add_argument( + '--resnet', action='store', default=50, type=int, + choices=(18, 34, 50, 101, 152), + help='ResNet variant (default: 50)') +parser.add_argument( + '--width', action='store', default=1, type=float, + help='Wide ResNet width factor (default: 1)') +parser.add_argument( + '--block-type', action='store', default=None, type=str, + choices=('basic', 'bottleneck'), + help='ResNet block type') +parser.add_argument( + '--blocks', action='store', default=None, type=str, + help='ResNet block counts (comma-separated list)') +parser.add_argument( + '--block-channels', action='store', default=None, type=str, + help='Internal channels in each ResNet block (comma-separated list)') +parser.add_argument( + '--bn-statistics-group-size', action='store', default=1, type=int, + help=('Group size for aggregating batch normalization statistics ' + '(default: 1)')) +parser.add_argument( + '--warmup', action='store_true', help='use a linear warmup') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=90, type=int, + help='number of epochs (default: 90)', metavar='NUM') +parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') +parser.add_argument( + '--random-seed', action='store', default=0, type=int, + help='random seed for LBANN RNGs', metavar='NUM') +lbann.contrib.args.add_optimizer_arguments(parser, default_learning_rate=0.1) +args = parser.parse_args() + +# Due to a data reader limitation, the actual model realization must be +# hardcoded to 1000 labels for ImageNet. +imagenet_labels = 1000 + +# Choose ResNet variant +resnet_variant_dict = {18: lbann.models.ResNet18, + 34: lbann.models.ResNet34, + 50: lbann.models.ResNet50, + 101: lbann.models.ResNet101, + 152: lbann.models.ResNet152} +wide_resnet_variant_dict = {50: lbann.contrib.models.wide_resnet.WideResNet50_2} +block_variant_dict = { + 'basic': lbann.models.resnet.BasicBlock, + 'bottleneck': lbann.models.resnet.BottleneckBlock +} + +if (any([args.block_type, args.blocks, args.block_channels]) + and not all([args.block_type, args.blocks, args.block_channels])): + raise RuntimeError('Must specify all of --block-type, --blocks, --block-channels') +if args.block_type and args.blocks and args.block_channels: + # Build custom ResNet. + resnet = lbann.models.ResNet( + block_variant_dict[args.block_type], + imagenet_labels, + list(map(int, args.blocks.split(','))), + list(map(int, args.block_channels.split(','))), + zero_init_residual=True, + bn_statistics_group_size=args.bn_statistics_group_size, + name='custom_resnet', + width=args.width) +elif args.width == 1: + # Vanilla ResNet. 
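+    # (With the argument defaults this branch builds, e.g.,
+    # lbann.models.ResNet50(1000, bn_statistics_group_size=1). The custom path
+    # above instead takes comma-separated lists; one hypothetical invocation:
+    # --block-type bottleneck --blocks 3,4,6,3 --block-channels 64,128,256,512.)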
+ resnet = resnet_variant_dict[args.resnet]( + imagenet_labels, + bn_statistics_group_size=args.bn_statistics_group_size) +elif args.width == 2 and args.resnet == 50: + # Use pre-defined WRN-50-2. + resnet = wide_resnet_variant_dict[args.resnet]( + imagenet_labels, + bn_statistics_group_size=args.bn_statistics_group_size) +else: + # Some other Wide ResNet. + resnet = resnet_variant_dict[args.resnet]( + imagenet_labels, + bn_statistics_group_size=args.bn_statistics_group_size, + width=args.width) + +# Construct layer graph +input_ = lbann.Input() +images = lbann.Identity(input_) +labels = lbann.Identity(input_) +preds = resnet(images) +probs = lbann.Softmax(preds) +cross_entropy = lbann.CrossEntropy(probs, labels) +top1 = lbann.CategoricalAccuracy(probs, labels) +top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) +layers = list(lbann.traverse_layer_graph(input_)) + +# Setup tensor core operations (just to demonstrate enum usage) +tensor_ops_mode = lbann.ConvTensorOpsMode.NO_TENSOR_OPS +for l in layers: + if type(l) == lbann.Convolution: + l.conv_tensor_op_mode=tensor_ops_mode + +# Setup objective function +l2_reg_weights = set() +for l in layers: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) +l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) +obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + +# Setup model +metrics = [lbann.Metric(top1, name='top-1 accuracy', unit='%'), + lbann.Metric(top5, name='top-5 accuracy', unit='%')] +callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDropFixedLearningRate( + drop_epoch=[30, 60, 80], amt=0.1)] +if args.warmup: + callbacks.append( + lbann.CallbackLinearGrowthLearningRate( + target=0.1 * args.mini_batch_size / 256, num_epochs=5)) +model = lbann.Model(args.num_epochs, + layers=layers, + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +# Setup optimizer +opt = lbann.contrib.args.create_optimizer(args) + +# Setup data reader +data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes) + +# Setup trainer +trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size, random_seed=args.random_seed) + +# Run experiment +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/applications/vision/summarizing_images/autoencoder_conv_summarize.py b/applications/vision/summarizing_images/autoencoder_conv_summarize.py new file mode 100644 index 00000000000..a1e2f67f751 --- /dev/null +++ b/applications/vision/summarizing_images/autoencoder_conv_summarize.py @@ -0,0 +1,281 @@ +################################################################################ +# Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# Written by the LBANN Research Team (B. Van Essen, et al.) listed in +# the CONTRIBUTORS file. +# +# LLNL-CODE-697807. +# All rights reserved. +# +# This file is part of LBANN: Livermore Big Artificial Neural Network +# Toolkit. For details, see http://software.llnl.gov/LBANN or +# https://github.com/LLNL/LBANN. +# +# Licensed under the Apache License, Version 2.0 (the "Licensee"); you +# may not use this file except in compliance with the License. 
You may +# obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# autoencoder_conv_summarize.py - A simple autoencoder for image data +# (Supports CIFAR-10 or Imagenet 1K) +# +# This example demonstrates the use of the image summarizer in +# autoencoder mode. +# +################################################################################ + +import os.path +import sys +import argparse +import lbann +import lbann.models +import lbann.contrib.args +import lbann.contrib.models.wide_resnet +import lbann.contrib.launcher + +# Get relative path to data +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'data')) +import cifar10 +import imagenet + +# Command-line arguments +desc = ('Construct and run a convolutional autoencoder on CIFAR-10 or ImageNet-1K data. ' + 'Running the experiment is only supported on LC systems.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_image_ae', type=str, + help='scheduler job name (default: lbann_image_ae)') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=90, type=int, + help='number of epochs (default: 90)', metavar='NUM') +parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') +parser.add_argument( + '--random-seed', action='store', default=0, type=int, + help='random seed for LBANN RNGs', metavar='NUM') +parser.add_argument( + '--dataset', action='store', default='imagenet', type=str, + help='dataset to use; \"cifar10\" or \"imagenet\"') +parser.add_argument( + '--data-reader-percent', action='store', + default=1.0, type=float, + help='the percent of the data to use (default: 1.0)', metavar='NUM') +lbann.contrib.args.add_optimizer_arguments(parser, default_learning_rate=0.1) +args = parser.parse_args() + +# Due to a data reader limitation, the actual model realization must be +# hardcoded to 1000 labels for ImageNet; 10 for CIFAR10. +dataset = args.dataset +if dataset == 'imagenet': + num_labels=1000 +elif dataset == 'cifar10': + num_labels=10 +else: + print("Dataset must be cifar10 or imagenet. 
Try again.") + exit() + +# Construct layer graph +input_ = lbann.Input(name='input') +image = lbann.Identity(input_, name='images') +dummy = lbann.Dummy(input_, name='labels') + +# Encoder + +conv1 = lbann.Convolution(image, + name="conv1", + num_dims=2, + num_output_channels=16, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu1 = lbann.Relu(conv1, name="relu1") + +pool1 = lbann.Pooling(relu1, + name="pool1", + num_dims=2, + pool_dims='2 2', + pool_pads='0 0', + pool_strides='1 1', + pool_mode="max", + has_vectors=True) + + +conv2 = lbann.Convolution(pool1, + name="conv2", + num_dims=2, + num_output_channels=8, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu2 = lbann.Relu(conv2, name="relu2") + +pool2 = lbann.Pooling(relu2, + name="pool2", + num_dims=2, + pool_dims='2 2', + pool_pads='0 0', + pool_strides='1 1', + pool_mode="max", + has_vectors=True) + +conv3 = lbann.Convolution(pool2, + name="conv3", + num_dims=2, + num_output_channels=8, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu3 = lbann.Relu(conv3, name="relu3") + +pool3 = lbann.Pooling(relu3, + name="pool3", + num_dims=2, + pool_dims='2 2', + pool_pads='0 0', + pool_strides='1 1', + pool_mode="max", + has_vectors=True) + +unpool3 = lbann.Unpooling(pool3, + name="unpool3", + num_dims=2, + pooling_layer=pool3.name) + +deconv3 = lbann.Deconvolution(unpool3, + name="deconv3", + num_dims=2, + num_output_channels=8, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu4 = lbann.Relu(deconv3, name="relu4") + +unpool2 = lbann.Unpooling(relu4, + name="unpool2", + num_dims=2, + pooling_layer=pool2.name) + +deconv2 = lbann.Deconvolution(unpool2, + name="deconv2", + num_dims=2, + num_output_channels=16, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu5 = lbann.Relu(deconv2, name="relu5") + +unpool1 = lbann.Unpooling(relu5, + name="unpool1", + num_dims=2, + pooling_layer=pool1.name) + +deconv1 = lbann.Deconvolution(unpool1, + name="deconv1", + num_dims=2, + num_output_channels=3, + conv_dims='3 3', + conv_pads='0 0', + conv_strides='1 1', + has_bias=True, + has_vectors=True) + +relu6 = lbann.Relu(deconv1, name="relu6") + +decode1 = lbann.FullyConnected(relu6, + name="decode1", + hint_layer=image, + has_bias=True) + +reconstruction = lbann.Sigmoid(decode1, + name="reconstruction") + + +# Reconstruction +mean_squared_error = lbann.MeanSquaredError([reconstruction, image], + name="mean_squared_error") + +layer_term = lbann.LayerTerm(mean_squared_error) +scale_factor = lbann.L2WeightRegularization(scale=0.0005) +obj = lbann.ObjectiveFunction([layer_term, scale_factor]) + +metrics = [lbann.Metric(mean_squared_error, name=mean_squared_error.name)] + +img_strategy = lbann.TrackSampleIDsStrategy( + input_layer_name=input_.name, + num_tracked_images=20) + +summarize_images = lbann.CallbackSummarizeImages( + selection_strategy=img_strategy, + image_source_layer_name=reconstruction.name, + epoch_interval=10) + +# Dump original image from input layer one time (high epoch interval) +summarize_input_layer = lbann.CallbackSummarizeImages( + selection_strategy=img_strategy, + image_source_layer_name=input_.name, + epoch_interval=10000) + +callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + summarize_images, + summarize_input_layer] + +layer_list = 
list(lbann.traverse_layer_graph(input_)) +model = lbann.Model( + args.num_epochs, + layers=layer_list, + objective_function=obj, + metrics=metrics, + callbacks=callbacks, + summary_dir=".") + +# Setup optimizer +opt = lbann.contrib.args.create_optimizer(args) + +# Setup data reader +num_classes=min(args.num_classes, num_labels) + +if dataset == "cifar10": + data_reader = cifar10.make_data_reader(num_classes=num_classes) +else: + data_reader = imagenet.make_data_reader(num_classes=num_classes) + +# Setup trainer +trainer = lbann.Trainer(random_seed=args.random_seed, mini_batch_size=args.mini_batch_size) + +# Run experiment +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +kwargs['lbann_args'] = '--data_reader_percent='+str(args.data_reader_percent)+' --disable_cuda=1' + +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/applications/vision/summarizing_images/autoencoder_summarize.py b/applications/vision/summarizing_images/autoencoder_summarize.py new file mode 100644 index 00000000000..ba967e4f031 --- /dev/null +++ b/applications/vision/summarizing_images/autoencoder_summarize.py @@ -0,0 +1,192 @@ +################################################################################ +# Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# Written by the LBANN Research Team (B. Van Essen, et al.) listed in +# the CONTRIBUTORS file. +# +# LLNL-CODE-697807. +# All rights reserved. +# +# This file is part of LBANN: Livermore Big Artificial Neural Network +# Toolkit. For details, see http://software.llnl.gov/LBANN or +# https://github.com/LLNL/LBANN. +# +# Licensed under the Apache License, Version 2.0 (the "Licensee"); you +# may not use this file except in compliance with the License. You may +# obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the license. +# +# autoencoder_summarize.py - A simple autoencoder for image data +# (Supports CIFAR-10 or Imagenet 1K) +# +# This example demonstrates the use of the image summarizer in +# autoencoder mode. +# +################################################################################ + +import os.path +import sys +import argparse +import lbann +import lbann.models +import lbann.contrib.args +import lbann.contrib.models.wide_resnet +import lbann.contrib.launcher + +# Get relative path to data +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'data')) +import cifar10 +import imagenet + +# Command-line arguments +desc = ('Construct and run ResNet on ImageNet-1K data. 
' + 'Running the experiment is only supported on LC systems.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_image_ae', type=str, + help='scheduler job name (default: lbann_resnet)') +parser.add_argument( + '--width', action='store', default=1, type=float, + help='Wide ResNet width factor (default: 1)') +parser.add_argument( + '--bn-statistics-group-size', action='store', default=1, type=int, + help=('Group size for aggregating batch normalization statistics ' + '(default: 1)')) +parser.add_argument( + '--warmup', action='store_true', help='use a linear warmup') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=90, type=int, + help='number of epochs (default: 90)', metavar='NUM') +parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') +parser.add_argument( + '--random-seed', action='store', default=0, type=int, + help='random seed for LBANN RNGs', metavar='NUM') +parser.add_argument( + '--dataset', action='store', default='imagenet', type=str, + help='dataset to use; \"cifar10\" or \"imagenet\"') +parser.add_argument( + '--data-reader-percent', action='store', + default=1.0, type=float, + help='the percent of the data to use (default: 1.0)', metavar='NUM') +lbann.contrib.args.add_optimizer_arguments(parser, default_learning_rate=0.1) +args = parser.parse_args() + +# Due to a data reader limitation, the actual model realization must be +# hardcoded to 1000 labels for ImageNet; 10 for CIFAR10. +dataset = args.dataset; +if dataset == 'imagenet': + num_labels=1000 +elif dataset == 'cifar10': + num_labels=10 +else: + print("Dataset must be cifar10 or imagenet. 
Try again.") + exit() + +# Construct layer graph +input_ = lbann.Input(name='input') +image = lbann.Identity(input_, name='images') +dummy = lbann.Dummy(input_, name='labels') + +# Encoder +encode1 = lbann.FullyConnected(image, + name="encode1", + data_layout="model_parallel", + num_neurons=1000, + has_bias=True) + +relu1 = lbann.Relu(encode1, name="relu1", data_layout="model_parallel") + +dropout1 = lbann.Dropout(relu1, + name="dropout1", + data_layout="model_parallel", + keep_prob=0.8) + +decode1 = lbann.FullyConnected(dropout1, + name="decode1", + data_layout="model_parallel", + hint_layer=image, + has_bias=True) + +reconstruction = lbann.Sigmoid(decode1, + name="reconstruction", + data_layout="model_parallel") + +dropout2 = lbann.Dropout(reconstruction, + name="dropout2", + data_layout="model_parallel", + keep_prob=0.8) + + +# Reconstruction +mean_squared_error = lbann.MeanSquaredError([dropout2, image], + name="mean_squared_error") + +layer_term = lbann.LayerTerm(mean_squared_error) +obj = lbann.ObjectiveFunction(layer_term) + +metrics = [lbann.Metric(mean_squared_error, name=mean_squared_error.name)] + +img_strategy = lbann.TrackSampleIDsStrategy( + input_layer_name=input_.name, + num_tracked_images=10) + +summarize_images = lbann.CallbackSummarizeImages( + selection_strategy=img_strategy, + image_source_layer_name=reconstruction.name, + epoch_interval=1) + +# Dump original image from input layer one time +summarize_input_layer = lbann.CallbackSummarizeImages( + selection_strategy=img_strategy, + image_source_layer_name=input_.name, + epoch_interval=10000) + +callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + summarize_input_layer, + summarize_images] + +layer_list = list(lbann.traverse_layer_graph(input_)) +model = lbann.Model(args.num_epochs, + layers=layer_list, + objective_function=obj, + metrics=metrics, + callbacks=callbacks, + summary_dir=".") + +# Setup optimizer +opt = lbann.contrib.args.create_optimizer(args) + +# Setup data reader +num_classes=min(args.num_classes, num_labels) + +if dataset == "cifar10": + data_reader = cifar10.make_data_reader(num_classes=num_classes) +else: + data_reader = imagenet.make_data_reader(num_classes=num_classes) + +# Setup trainer +trainer = lbann.Trainer(random_seed=args.random_seed, mini_batch_size=args.mini_batch_size) + +# Run experiment +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +kwargs['lbann_args'] = '--data_reader_percent='+str(args.data_reader_percent) + +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/applications/vision/summarizing_images/resnet_summarize.py b/applications/vision/summarizing_images/resnet_summarize.py new file mode 100644 index 00000000000..1690dcc1f22 --- /dev/null +++ b/applications/vision/summarizing_images/resnet_summarize.py @@ -0,0 +1,226 @@ +################################################################################ +# Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# Written by the LBANN Research Team (B. Van Essen, et al.) listed in +# the CONTRIBUTORS file. +# +# LLNL-CODE-697807. +# All rights reserved. +# +# This file is part of LBANN: Livermore Big Artificial Neural Network +# Toolkit. For details, see http://software.llnl.gov/LBANN or +# https://github.com/LLNL/LBANN. +# +# Licensed under the Apache License, Version 2.0 (the "Licensee"); you +# may not use this file except in compliance with the License. 
You may +# obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the license. +# +# resnet_summarize.py - A simple residual learning model for image data +# (Supports CIFAR-10 or Imagenet 1K) +# +# This example demonstrates the use of the image summarizer in +# categorical accuracy mode. +# +################################################################################ + +import os.path +import sys +import argparse +import lbann +import lbann.models +import lbann.models.resnet +import lbann.contrib.args +import lbann.contrib.models.wide_resnet +import lbann.contrib.launcher + +# Get relative path to data +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'data')) +import cifar10 +import imagenet + +# Command-line arguments +desc = ('Construct and run ResNet on ImageNet-1K data. ' + 'Running the experiment is only supported on LC systems.') +parser = argparse.ArgumentParser(description=desc) +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_resnet', type=str, + help='scheduler job name (default: lbann_resnet)') +parser.add_argument( + '--resnet', action='store', default=50, type=int, + choices=(18, 34, 50, 101, 152), + help='ResNet variant (default: 50)') +parser.add_argument( + '--width', action='store', default=1, type=float, + help='Wide ResNet width factor (default: 1)') +parser.add_argument( + '--block-type', action='store', default=None, type=str, + choices=('basic', 'bottleneck'), + help='ResNet block type') +parser.add_argument( + '--blocks', action='store', default=None, type=str, + help='ResNet block counts (comma-separated list)') +parser.add_argument( + '--block-channels', action='store', default=None, type=str, + help='Internal channels in each ResNet block (comma-separated list)') +parser.add_argument( + '--bn-statistics-group-size', action='store', default=1, type=int, + help=('Group size for aggregating batch normalization statistics ' + '(default: 1)')) +parser.add_argument( + '--warmup', action='store_true', help='use a linear warmup') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=90, type=int, + help='number of epochs (default: 90)', metavar='NUM') +parser.add_argument( + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') +parser.add_argument( + '--random-seed', action='store', default=0, type=int, + help='random seed for LBANN RNGs', metavar='NUM') +parser.add_argument( + '--dataset', action='store', default='imagenet', type=str, + help='dataset to use; \"cifar10\" or \"imagenet\"') +parser.add_argument( + '--data-reader-percent', action='store', + default=1.0, type=float, + help='the percent of the data to use (default: 1.0)', metavar='NUM') +lbann.contrib.args.add_optimizer_arguments(parser, default_learning_rate=0.1) +args = parser.parse_args() + +# Due to a data reader limitation, the actual model realization must be +# hardcoded to 1000 
labels for ImageNet; 10 for CIFAR10. +dataset = args.dataset; +if dataset == 'imagenet': + num_labels=1000 +elif dataset == 'cifar10': + num_labels=10 +else: + print("Dataset must be cifar10 or imagenet. Try again.") + exit() + +# Choose ResNet variant +resnet_variant_dict = {18: lbann.models.ResNet18, + 34: lbann.models.ResNet34, + 50: lbann.models.ResNet50, + 101: lbann.models.ResNet101, + 152: lbann.models.ResNet152} +wide_resnet_variant_dict = {50: lbann.contrib.models.wide_resnet.WideResNet50_2} +block_variant_dict = { + 'basic': lbann.models.resnet.BasicBlock, + 'bottleneck': lbann.models.resnet.BottleneckBlock +} + +if (any([args.block_type, args.blocks, args.block_channels]) + and not all([args.block_type, args.blocks, args.block_channels])): + raise RuntimeError('Must specify all of --block-type, --blocks, --block-channels') +if args.block_type and args.blocks and args.block_channels: + # Build custom ResNet. + resnet = lbann.models.ResNet( + block_variant_dict[args.block_type], + num_labels, + list(map(int, args.blocks.split(','))), + list(map(int, args.block_channels.split(','))), + zero_init_residual=True, + bn_statistics_group_size=args.bn_statistics_group_size, + name='custom_resnet', + width=args.width) +elif args.width == 1: + # Vanilla ResNet. + resnet = resnet_variant_dict[args.resnet]( + num_labels, + bn_statistics_group_size=args.bn_statistics_group_size) +elif args.width == 2 and args.resnet == 50: + # Use pre-defined WRN-50-2. + resnet = wide_resnet_variant_dict[args.resnet]( + num_labels, + bn_statistics_group_size=args.bn_statistics_group_size) +else: + # Some other Wide ResNet. + resnet = resnet_variant_dict[args.resnet]( + num_labels, + bn_statistics_group_size=args.bn_statistics_group_size, + width=args.width) + +# Construct layer graph +input_ = lbann.Input(name='input') +images = lbann.Identity(input_, name='images') +labels = lbann.Identity(input_, name='labels') +preds = resnet(images) +probs = lbann.Softmax(preds) +cross_entropy = lbann.CrossEntropy(probs, labels) +top1 = lbann.CategoricalAccuracy(probs, labels, name='louise') +top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) +layer_list = list(lbann.traverse_layer_graph(input_)) + +# Setup objective function +l2_reg_weights = set() +for l in layer_list: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) +l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) +obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + +# Setup model +metrics = [lbann.Metric(top1, name='top-1 accuracy', unit='%'), + lbann.Metric(top5, name='top-5 accuracy', unit='%')] + +img_strategy = lbann.CategoricalAccuracyStrategy( + accuracy_layer_name=top1.name, + match_type=lbann.CategoricalAccuracyStrategy.MatchType.NOMATCH, + num_images_per_epoch=10) + +summarize_images = lbann.CallbackSummarizeImages( + selection_strategy=img_strategy, + image_source_layer_name=images.name, + epoch_interval=5) + +callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDropFixedLearningRate( + drop_epoch=[30, 60, 80], amt=0.1), + summarize_images] +if args.warmup: + callbacks.append( + lbann.CallbackLinearGrowthLearningRate( + target=0.1 * args.mini_batch_size / 256, num_epochs=5)) +model = lbann.Model(args.num_epochs, + layers=layer_list, + objective_function=obj, + metrics=metrics, + callbacks=callbacks, + summary_dir=".") + +# Setup optimizer +opt = lbann.contrib.args.create_optimizer(args) + +# Setup data reader +num_classes=min(args.num_classes, 
num_labels) + +if dataset == "cifar10": + data_reader = cifar10.make_data_reader(num_classes=num_classes) +else: + data_reader = imagenet.make_data_reader(num_classes=num_classes) + +# Setup trainer +trainer = lbann.Trainer(random_seed=args.random_seed, mini_batch_size=args.mini_batch_size) + +# Run experiment +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +kwargs['lbann_args'] = '--data_reader_percent='+str(args.data_reader_percent) +lbann.contrib.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/bamboo/README.md b/bamboo/README.md index c317c496379..ccb1813e878 100644 --- a/bamboo/README.md +++ b/bamboo/README.md @@ -1,93 +1,3 @@ -# LBANN CI - -Bamboo is the continuous integration (CI) framework we use. A Bamboo plan consists of stages (which run sequentially), which consist of jobs (which run in parallel), which consist of tasks (which run sequentially). - -The LBANN build project has many plans. Two plans run off of [`LLNL/lbann/develop`](https://github.com/LLNL/lbann/tree/develop "https://github.com/LLNL/lbann/tree/develop") - Nightly Develop and Weekly Develop. Nightly Develop runs every night (except Saturday) at midnight. Weekly Develop runs every Saturday at midnight. The other plans in the build project are for each individual LBANN developer's fork of LBANN. - -All plans run off the latest *pushed* commits to the repository. That means if you have local commits that you have not pushed to your fork, these commits will *not* be tested by Bamboo. If you have pushed commits to your fork but have not merged your branch into the main repository's `develop`, your commits will be tested on your individual plan, but not on Nightly Develop or Weekly Develop. - -## Plan Configuration -Each plan is identical (except Weekly Develop, which will be explained below). The plans consist of a single stage `Tests`. The stage consists of three jobs - `ppc64le_gpu`, `x86_cpu`, and `x86_gpu`. Each of these three jobs can run in parallel. They consist of an identical list of tasks: -1. Checkout Default Repository (checkout the repository) -2. Remove Generated Files (each build creates a large number of files. We may look at these files between builds, so we cannot delete them at the end of a build. So, instead we delete them before doing any real work in the next build. This also ensures the generated files came from the latest build and not a previous build). -3. Compiler Tests (run tests in `bamboo/compiler_tests`) -4. Integration Tests (run tests in `bamboo/integration_tests`) -5. Unit Tests (run tests in `bamboo/unit_tests`) -6. JUnit Parser (this allows Bamboo to render test results in a nice UI) - -The three testing tasks differ somewhat between jobs. However, they all execute some variant of `python -m pytest -s --junitxml=results.xml`, which will run all the pytests in the job's associated directory. - -Weekly Develop adds the `--weekly` option (`python -m pytest -s --weekly --junitxml=results.xml`). Many (mostly longer-running) tests are set to not run unless this option is on. Weekly Develop runs a superset of the tests that Nightly Develop runs. - -## Directory Structure - -`bamboo/compiler_tests`, `bamboo/integration_tests`, `bamboo/unit_tests` each have a `conftest.py` that pytest requires. They also contain one or more python files. Each of these files have a number of tests to run. - -## Writing Your Own Tests - -A side effect of our Bamboo setup is that tests must be written using pytest. 
Test files must begin with `test_` to be recognized by pytest. Individual test methods must also begin with `test_`. Test methods should use the `assert` keyword. A test will only fail if the assertion turns out to be false. Not putting an assertion will automatically cause the test to pass. - -How then to test non-Python code? You can just wrap your test with Python. A test can be as simple as asserting the output of a shell command is 0. The output of a command can be found using Python's `os.system()`. - -## Running Tests On Your Individual Plan - -Unlike Nightly Develop, the individual plans are triggered to run by polling your fork for commits. They do not run nightly. If you push new commits to your fork, a new build should start automatically. You can also manually start a build by navigating to your individual plan and clicking Run > Run Plan. Once again, keep in mind that the tests will run off what has been pushed to your GitHub fork of LBANN and not your local copy of the LBANN repository. - -## Navigating Bamboo - -From the [LBANN Project Summary](https://lc.llnl.gov/bamboo/browse/LBANN "https://lc.llnl.gov/bamboo/browse/LBANN"), click on a build project. From there, click on a build (builds are listed under "Recent History" and can also be accessed from the pass/fail marks in the top right, to the left of the "Run" button). This will bring you to a certain build's page. The most relevant tabs are "Tests" and "Logs". It is recommended to look at failures first in the "Tests" tab, as the build logs can be difficult to parse through. The build's "Tests" tab shows "New test failures", "Existing test failures", "Fixed tests", and "Skipped Tests". - -From the build's page, you can also click on individual jobs, which have the same tabs. The "Tests" tabs of the individual jobs have two sub-tabs, "Failed tests" and "Successful tests". They do not display skipped tests. The Bamboo agent that ran the job can be found by looking at the "Agent" field under the "Job Summary" tab. Alternatively, you can determine the agent from one of the first lines in the build logs: `Build working directory is /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir/`. - -Some build logs can be very large (e.g. over 100,000 lines). Beyond about 5,000 lines it is a good idea to download a log instead of viewing it in the browser. Beyond about 10,000 lines, some text editors may experience slowness. At this point it is good to split up the files with `split -l 10000 `, which creates files of the form `x*` and of length 10,000. You can then run a command such as `grep -in "Errors for:" x*` to find which files have reported errors. After you are done, you can remove the files with `rm x*`. Note that the original log file is not modified by any of these steps. - -As an alternative to splitting the file, errors can be searched for with `grep -in -A "Errors for:" `. - -## Bamboo Agent Properties - -Bamboo agent properties are used to specify requirements for each job. 
- -| Agents (jobs) | `agent_owner` | `architecture` | `cluster` | `gpu_architecture` | `sys_type` | -| --- | --- | --- | --- | --- | --- | -| Catalyst Agents (x86_cpu) | `lbannusr` | `x86_64` | `catalyst` | `none` | `toss_3_x86_64_ib` | -| Pascal Agents (x86_gpu_pascal) | `lbannusr` | `x86_64` | `pascal` | `pascal` | `chaos_6_x86_64_ib` | -| Quartz Agents (x86_cpu) | `lbannusr` | `x86_64` | `quartz` | `none` | `toss_3_x86_64_ib` | -| Ray Agents (ppc64le_gpu) | `lbannusr` | `ppc64_le` | `ray` | `pascal` | `blueos_3_ppc64le_ib` | -| Surface Agents (x86_gpu) | `lbannusr` | `x86_64` | `surface` | `kepler` | `chaos_5_x86_64_ib` | - -Currently, `agent_owner`, `architecture`, and `gpu_architecture` are used to determine agents to run a job. - -# Running Tests From The Command Line - -Navigate to `bamboo/compiler_tests`, `bamboo/integration_tests`, or `bamboo/unit_tests`. - -To run all the tests in a subdirectory: `python -m pytest -s --weekly`. Note that running all tests can take a substantial amount of time. - -To run the tests that Nightly Develop or the individual plans run in a subdirectory: `python -m pytest -s`. - -To run a specific test file: `python -m pytest -s .py`. - -To run a specific test: `python -m pytest -s .py -k ''`. - -Most integration and unit tests allow for running a test with a different executable. The convention is to have a similarly structured test replacing `_` with `_exe`. These tests are set to be skipped in Bamboo, but can be run locally. There should be a line above the test that gives the command to run the test locally, likely in the following form: `python -m pytest -s .py -k '' --exe=`. - -At this time, there is no way to run all the `_exe` tests in a subdirectory and only those. - -# Helpful Files - -First, run `sudo lbannusr`. - -To look at output and error from previous builds: `cd /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir//bamboo//` - -To look at archived results from previous builds: `cd /usr/workspace/wsb/lbannusr/archives/` - -To look at Bamboo agent properties: `cat /usr/global/tools/bamboo/agents/lbannusr//bin/bamboo-capabilities.properties` - -You can copy these files over to your own machine as follows: -- `sudo lbannusr` -- `give ` -- `exit` - to go back to your own LC account, not `lbannusr`'s. -- `take lbannusr` - now the file exists on your LC account, but not yet on your own machine. - -From your own machine, not a ssh terminal: -- `scp @.llnl.gov: .` +Refer to `lbann/docs/continuous_integration.rst` +or "LBANN CI" on the [LBANN docs](http://software.llnl.gov/lbann/) - +specifically [LBANN CI docs](https://lbann.readthedocs.io/en/latest/continuous_integration.html). diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh new file mode 100755 index 00000000000..2cd798d0e76 --- /dev/null +++ b/bamboo/allocate_and_run.sh @@ -0,0 +1,93 @@ +#!/bin/bash -l + +CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') + +echo "allocate_and_run.sh CLUSTER=${CLUSTER}" + +export PYTHONPATH=${HOME}/.local/lib/python3.7/site-packages:${PYTHONPATH} + +WEEKLY=0 +while :; do + case ${1} in + --weekly) + # Run all tests. This is a weekly build. 
+ echo "Setting WEEKLY in allocate_and_run.sh" + WEEKLY=1 + ;; + -?*) + # Unknown option + echo "Unknown option (${1})" >&2 + exit 1 + ;; + *) + # Break loop if there are no more options + break + esac + shift +done + +echo "allocate_and_run.sh WEEKLY=${WEEKLY}" + +if [ "${CLUSTER}" = 'pascal' ]; then + export MV2_USE_CUDA=1 +fi + +ALLOCATION_TIME_LIMIT_NIGHTLY=45 +ALLOCATION_TIME_LIMIT_WEEKLY=90 + +if [ "${CLUSTER}" = 'lassen' ]; then + ALLOCATION_TIME_LIMIT_NIGHTLY=90 + ALLOCATION_TIME_LIMIT_WEEKLY=120 + if [ ${WEEKLY} -ne 0 ]; then + timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 4 -W ${ALLOCATION_TIME_LIMIT_WEEKLY} ./run.sh --weekly + else + timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 2 -W ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + fi +elif [ "${CLUSTER}" = 'ray' ]; then + if [ ${WEEKLY} -ne 0 ]; then + timeout -k 5 24h bsub -Is -q pbatch -nnodes 4 -W ${ALLOCATION_TIME_LIMIT_WEEKLY} ./run.sh --weekly + else + timeout -k 5 24h bsub -Is -q pbatch -nnodes 2 -W ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + fi +elif [ "${CLUSTER}" = 'corona' ]; then + if [ ${WEEKLY} -ne 0 ]; then + ALLOCATION_TIME_LIMIT_WEEKLY=960 + timeout -k 5 24h salloc -N4 --partition=mi60 -t ${ALLOCATION_TIME_LIMIT_WEEKLY} ./run.sh --weekly + else + ALLOCATION_TIME_LIMIT_NIGHTLY=90 # Start with 1.5 hrs; may adjust for CPU clusters + if [[ $(mjstat -c | awk 'match($1, "mi60") && NF < 7 { print $5 }') -ne "0" ]]; + then + timeout -k 5 24h salloc -N2 --partition=mi60 -t ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + else + echo "Partition \"mi60\" on cluster \"${CLUSTER}\" appears to be down." + echo "Trying \"mi25\"." + timeout -k 5 24h salloc -N2 --partition=mi25 -t ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + fi + fi +elif [ "${CLUSTER}" = 'pascal' ]; then + if [ ${WEEKLY} -ne 0 ]; then + timeout -k 5 24h salloc -N4 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT_WEEKLY} ./run.sh --weekly + else + if [[ $(mjstat -c | awk 'match($1, "pbatch") && NF < 7 { print $5 }') -ne "0" ]]; + then + timeout -k 5 24h salloc -N2 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + else + echo "Partition \"pbatch\" on cluster \"${CLUSTER}\" appears to be down." + fi + fi +elif [ "${CLUSTER}" = 'catalyst' ]; then + if [ ${WEEKLY} -ne 0 ]; then + ALLOCATION_TIME_LIMIT_WEEKLY=960 + timeout -k 5 24h salloc -N4 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT_WEEKLY} ./run.sh --weekly + else + ALLOCATION_TIME_LIMIT_NIGHTLY=90 # Start with 1.5 hrs; may adjust for CPU clusters + if [[ $(mjstat -c | awk 'match($1, "pbatch") && NF < 7 { print $5 }') -ne "0" ]]; + then + timeout -k 5 24h salloc -N2 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT_NIGHTLY} ./run.sh + else + echo "Partition \"pbatch\" on cluster \"${CLUSTER}\" appears to be down." + fi + fi +else + echo "allocate_and_run.sh. 
Unsupported cluster CLUSTER=${CLUSTER}" +fi diff --git a/bamboo/clean.sh b/bamboo/clean.sh index 254930cb247..03b7826cf2a 100755 --- a/bamboo/clean.sh +++ b/bamboo/clean.sh @@ -6,10 +6,9 @@ LBANN_DIR=$(git rev-parse --show-toplevel) # Compiler Tests rm -f ${LBANN_DIR}/bamboo/compiler_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/compiler_tests/__pycache__ -rm -rf ${LBANN_DIR}/bamboo/compiler_tests/builds/*_debug -rm -rf ${LBANN_DIR}/bamboo/compiler_tests/builds/*_rel -rm -f ${LBANN_DIR}/bamboo/compiler_tests/error/*.txt -rm -f ${LBANN_DIR}/bamboo/compiler_tests/output/*.txt +rm -rf ${LBANN_DIR}/bamboo/compiler_tests/builds/* +rm -f ${LBANN_DIR}/bamboo/compiler_tests/error/* +rm -f ${LBANN_DIR}/bamboo/compiler_tests/output/* # Integration Tests rm -f ${LBANN_DIR}/bamboo/integration_tests/*.pgm @@ -17,13 +16,15 @@ rm -f ${LBANN_DIR}/bamboo/integration_tests/*.prototext* rm -f ${LBANN_DIR}/bamboo/integration_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/integration_tests/__pycache__ rm -f ${LBANN_DIR}/bamboo/integration_tests/*.tfevents.* -rm -f ${LBANN_DIR}/bamboo/integration_tests/error/*.txt -rm -f ${LBANN_DIR}/bamboo/integration_tests/output/*.txt +rm -rf ${LBANN_DIR}/bamboo/integration_tests/experiments/* # Unit Tests +rm -rf ${LBANN_DIR}/bamboo/unit_tests/ckpt* +rm -rf ${LBANN_DIR}/bamboo/unit_tests/lbann2_* rm -f ${LBANN_DIR}/bamboo/unit_tests/*.prototext* rm -f ${LBANN_DIR}/bamboo/unit_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/unit_tests/__pycache__ rm -f ${LBANN_DIR}/bamboo/unit_tests/*.tfevents.* -rm -f ${LBANN_DIR}/bamboo/unit_tests/error/*.txt -rm -f ${LBANN_DIR}/bamboo/unit_tests/output/*.txt +rm -f ${LBANN_DIR}/bamboo/unit_tests/error/* +rm -f ${LBANN_DIR}/bamboo/unit_tests/output/* +rm -rf ${LBANN_DIR}/bamboo/unit_tests/experiments/* diff --git a/bamboo/common_python/data/__init__.py b/bamboo/common_python/data/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/bamboo/common_python/data/imagenet/__init__.py b/bamboo/common_python/data/imagenet/__init__.py new file mode 100644 index 00000000000..3ab346ec2dd --- /dev/null +++ b/bamboo/common_python/data/imagenet/__init__.py @@ -0,0 +1,44 @@ +import os +import os.path + +import google.protobuf.text_format + +def make_data_reader(lbann, num_classes=1000): + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
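+    # (The caller passes in its own imported `lbann` module so this helper works
+    # with whichever LBANN build the test harness has on its path. The code below
+    # loads the prototext message next to this file, checks that the ImageNet
+    # data directories and label files are accessible, patches those paths into
+    # the train and validate readers, and returns the message. A minimal usage
+    # sketch, assuming an importable LBANN build and a trainer, model, and
+    # optimizer built elsewhere (those names are placeholders here):
+    #
+    #     import lbann
+    #     import lbann.contrib.launcher
+    #     reader = make_data_reader(lbann, num_classes=10)
+    #     lbann.contrib.launcher.run(trainer, model, reader, opt)
+    #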
+ import lbann.contrib.lc.paths + + # Load Protobuf message from file + current_dir = os.path.dirname(os.path.realpath(__file__)) + protobuf_file = os.path.join(current_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Check if data paths are accessible + train_data_dir = lbann.contrib.lc.paths.imagenet_dir(data_set='train', + num_classes=num_classes) + train_label_file = lbann.contrib.lc.paths.imagenet_labels(data_set='train', + num_classes=num_classes) + test_data_dir = lbann.contrib.lc.paths.imagenet_dir(data_set='val', + num_classes=num_classes) + test_label_file = lbann.contrib.lc.paths.imagenet_labels(data_set='val', + num_classes=num_classes) + if not os.path.isdir(train_data_dir): + raise FileNotFoundError('could not access {}'.format(train_data_dir)) + if not os.path.isfile(train_label_file): + raise FileNotFoundError('could not access {}'.format(train_label_file)) + if not os.path.isdir(test_data_dir): + raise FileNotFoundError('could not access {}'.format(test_data_dir)) + if not os.path.isfile(test_label_file): + raise FileNotFoundError('could not access {}'.format(test_label_file)) + + # Set paths + message.reader[0].data_filedir = train_data_dir + message.reader[0].data_filename = train_label_file + message.reader[1].data_filedir = test_data_dir + message.reader[1].data_filename = test_label_file + + return message diff --git a/bamboo/common_python/data/imagenet/data_reader.prototext b/bamboo/common_python/data/imagenet/data_reader.prototext new file mode 100644 index 00000000000..3f4e0270f3f --- /dev/null +++ b/bamboo/common_python/data/imagenet/data_reader.prototext @@ -0,0 +1,60 @@ +data_reader { + reader { + name: "imagenet" + role: "train" + shuffle: true + data_filedir: "path/to/ILSVRC2012/train" + data_filename: "path/to/ILSVRC2012/labels/train.txt" + validation_percent: 0.0 + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + random_resized_crop { + height: 224 + width: 224 + } + } + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } + + reader { + name: "imagenet" + role: "validate" + data_filedir: "path/to/ILSVRC2012/val" + data_filename: "path/to/ILSVRC2012/labels/val.txt" + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + resized_center_crop { + height: 256 + width: 256 + crop_height: 224 + crop_width: 224 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } +} diff --git a/bamboo/common_python/data/mnist/__init__.py b/bamboo/common_python/data/mnist/__init__.py new file mode 100644 index 00000000000..3c4546011cd --- /dev/null +++ b/bamboo/common_python/data/mnist/__init__.py @@ -0,0 +1,34 @@ +import gzip +import os +import os.path +import urllib.request + +import google.protobuf.text_format + +def make_data_reader(lbann): + """Make Protobuf message for MNIST data reader. + + MNIST data is downloaded if needed. + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
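+    # (As with the ImageNet reader above, the caller supplies its imported
+    # `lbann` module; the reader paths in the prototext are then pointed at the
+    # LC MNIST directory via lbann.contrib.lc.paths.mnist_dir().)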
+ import lbann.contrib.lc.paths + + # Load data readers from prototext + current_dir = os.path.dirname(os.path.realpath(__file__)) + # Load Protobuf message from file + protobuf_file = os.path.join(current_dir, + 'data_reader.prototext') + + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set paths + for reader in message.reader: + reader.data_filedir = lbann.contrib.lc.paths.mnist_dir() + + return message diff --git a/bamboo/common_python/data/mnist/data_reader.prototext b/bamboo/common_python/data/mnist/data_reader.prototext new file mode 100644 index 00000000000..61c3b32cf42 --- /dev/null +++ b/bamboo/common_python/data/mnist/data_reader.prototext @@ -0,0 +1,30 @@ +data_reader { + reader { + name: "mnist" + role: "train" + shuffle: true + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "train-images-idx3-ubyte" + label_filename: "train-labels-idx1-ubyte" + validation_percent: 0.1 + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } + reader { + name: "mnist" + role: "test" + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "t10k-images-idx3-ubyte" + label_filename: "t10k-labels-idx1-ubyte" + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } +} diff --git a/bamboo/common_python/test_tools.py b/bamboo/common_python/test_tools.py index 6cafbb39bd6..2146ba05b3b 100644 --- a/bamboo/common_python/test_tools.py +++ b/bamboo/common_python/test_tools.py @@ -1,161 +1,267 @@ import pytest import tools -# This test isn't in a directory to be run from Bamboo + +# This test file isn't in a directory to be run from Bamboo # Run locally with python -m pytest -s +d = dict( + executable='exe', + num_nodes=20, + partition='pdebug', + time_limit=30, + num_processes=40, + dir_name='dir', + data_filedir_default='lscratchh/filedir', + data_reader_name='mnist', + data_reader_percent=0.10, + exit_after_setup=True, + mini_batch_size=15, + model_folder='models/folder', + model_name='lenet', + num_epochs=7, + optimizer_name='adagrad', + processes_per_model=10, + extra_lbann_flags={'print_affinity': None}, + output_file_name='output_file', + error_file_name='error_file', + check_executable_existence=False) + + def test_command_catalyst(): - actual = tools.get_command(cluster='catalyst', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + actual = tools.get_command(cluster='catalyst', **d) + expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext 
--data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected -def test_command_pascal(): - actual = tools.get_command(cluster='pascal', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + +def test_command_corona(): + actual = tools.get_command(cluster='corona', **d) + expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected -def test_command_quartz(): - actual = tools.get_command(cluster='quartz', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --data_filedir=lscratchh/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + +def test_command_lassen(): + actual = tools.get_command(cluster='lassen', **d) + expected = 'bsub -G guests -Is -q pdebug -nnodes 20 -W 30 jsrun -b "packed:10" -c 40 -g 4 -d packed -n 16 -r 1 -a 4 exe --data_filedir=gpfs1/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected - -def test_command_surface(): - actual = tools.get_command(cluster='surface', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', 
data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + + +def test_command_pascal(): + actual = tools.get_command(cluster='pascal', **d) + expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected + def test_command_ray(): - actual = tools.get_command(cluster='ray', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + actual = tools.get_command(cluster='ray', **d) + expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun --timeout=30 -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected + # Test error cases ############################################################ -def test_blacklisted_substrings(): + +def test_blacklisted_substrings_1(): try: - tools.get_command('ray', 'exe', partition=';', optimizer_path='--model=new_model', check_executable_existence=False) + tools.get_command('ray', 'exe', partition=';', + optimizer_path='--model=new_model', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid character(s): ; contains ; , --model=new_model contains --' assert actual == expected + +def test_blacklisted_substrings_2(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags={'--bad_key': 5}, + check_executable_existence=False) + assert False + except Exception as 
e: + actual = str(e) + expected = 'Invalid character(s): --bad_key contains --' + assert actual == expected + + +def test_blacklisted_substrings_3(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags={'key': '--bad_value'}, + check_executable_existence=False) + assert False + except Exception as e: + actual = str(e) + expected = 'Invalid character(s): --bad_value contains --' + assert actual == expected + + def test_unsupported_cluster(): try: - tools.get_command('quartz', 'exe', check_executable_existence=False) + tools.get_command('q', 'exe', check_executable_existence=False) + assert False except Exception as e: actual = str(e) - expected = 'Unsupported Cluster: quartz' + expected = 'Unsupported Cluster: q' assert actual == expected + def test_bad_model_1(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_name='name', model_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', + model_name='name', model_path='path', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected + def test_bad_model_2(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', + model_path='path', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected + def test_bad_model_3(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_name='name', model_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_name='name', + model_path='path', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected + def test_bad_model_4(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_folder set but not model_name.' assert actual == expected + def test_bad_model_5(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_name='name', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_name set but not model_folder.' assert actual == expected + def test_bad_data_reader(): try: - tools.get_command('catalyst', 'exe', dir_name='dir', data_reader_name='name', data_reader_path='path', check_executable_existence=False) + tools.get_command('catalyst', 'exe', dir_name='dir', + data_reader_name='name', data_reader_path='path', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_path is set but so is data_reader_name , data_reader_name or data_reader_path is set but not data_filedir_default. 
If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected + def test_bad_optimizer(): try: - tools.get_command('ray', 'exe', dir_name='dir', optimizer_name='name', optimizer_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', optimizer_name='name', + optimizer_path='path', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: optimizer_path is set but so is optimizer_name' assert actual == expected + def test_bad_dir_name_1(): try: - tools.get_command('ray', 'exe', dir_name='dir', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', + check_executable_existence=False) + assert False except Exception as e: - actual = str(e) - expected = 'Invalid Usage: dir_name set but none of model_folder, model_name, data_reader_name, optimizer_name are.' + actual = str(e) + expected = 'Invalid Usage: dir_name set but none of model_folder, model_name, data_reader_name, optimizer_name are.' assert actual == expected + def test_bad_dir_name_2(): try: - tools.get_command('ray', 'exe', model_folder='folder', check_executable_existence=False) + tools.get_command('ray', 'exe', model_folder='folder', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' assert actual == expected + def test_bad_dir_name_3(): try: - tools.get_command('ray', 'exe', model_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', model_name='name', + check_executable_existence=False) + assert False except Exception as e: - actual = str(e) - expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' + actual = str(e) + expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' assert actual == expected + def test_bad_dir_name_4(): try: - tools.get_command('catalyst', 'exe', data_reader_name='name', check_executable_existence=False) + tools.get_command('catalyst', 'exe', data_reader_name='name', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is. , data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected + def test_bad_dir_name_5(): try: - tools.get_command('ray', 'exe', optimizer_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', optimizer_name='name', + check_executable_existence=False) + assert False except Exception as e: - actual = str(e) - expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' 
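These error-case tests all follow the same try/except pattern, with the added `assert False` guaranteeing a failure when `tools.get_command` does not raise. The same check can be written more compactly with `pytest.raises`; a minimal sketch, reusing the 'Unsupported Cluster' message asserted above (illustrative only, not part of this patch):

    import re
    import pytest
    import tools

    def test_unsupported_cluster_with_raises():
        # pytest.raises fails the test automatically when no exception is
        # thrown, which replaces the explicit `assert False` bookkeeping.
        with pytest.raises(Exception,
                           match=re.escape('Unsupported Cluster: q')):
            tools.get_command('q', 'exe', check_executable_existence=False)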
+ actual = str(e) + expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' assert actual == expected + def test_bad_data_filedir_1(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filedir_train_default='a', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filedir_train_default='a', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_2(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filename_train_default='b', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filename_train_default='b', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' @@ -164,33 +270,50 @@ def test_bad_data_filedir_2(): def test_bad_data_filedir_3(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filedir_test_default='c', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filedir_test_default='c', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_4(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filename_test_default='d', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filename_test_default='d', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_5(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_train_default='e', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filedir_train_default='e', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_6(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_train_default='f', check_executable_existence=False) + tools.get_command('ray', 
'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filename_train_default='f', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' @@ -199,47 +322,68 @@ def test_bad_data_filedir_6(): def test_bad_data_filedir_7(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_test_default='g', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filedir_test_default='g', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_8(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_test_default='h', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filename_test_default='h', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_9(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected + def test_bad_data_filedir_10(): try: - tools.get_command('ray', 'exe', data_reader_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected + def test_bad_data_filedir_11(): try: - tools.get_command('ray', 'exe', data_filedir_default='filedir', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filedir_default='filedir', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_filedir_default set but neither data_reader_name or data_reader_path are.' 
- assert actual == expected + assert actual == expected + def test_bad_data_filedir_12(): try: - tools.get_command('ray', 'exe', data_filedir_train_default='a', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filedir_train_default='a', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' @@ -248,7 +392,9 @@ def test_bad_data_filedir_12(): def test_bad_data_filedir_13(): try: - tools.get_command('ray', 'exe', data_filename_train_default='b', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filename_train_default='b', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' @@ -257,7 +403,9 @@ def test_bad_data_filedir_13(): def test_bad_data_filedir_14(): try: - tools.get_command('ray', 'exe', data_filedir_test_default='c', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filedir_test_default='c', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' @@ -266,8 +414,48 @@ def test_bad_data_filedir_14(): def test_bad_data_filedir_15(): try: - tools.get_command('ray', 'exe', data_filename_test_default='e', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filename_test_default='e', + check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' assert actual == expected + + +def test_bad_extra_lbann_flags_invalid_flag(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags={'invalid_flag': 'value'}, + check_executable_existence=False) + assert False + except Exception as e: + actual = str(e) + expected = ("Invalid Usage: extra_lbann_flags includes invalid" + " flag=invalid_flag. Flags must" + " be in ['hydrogen_block_size', 'procs_per_trainer'," + " 'num_parallel_readers', 'num_io_threads', 'serialize_io'," + " 'disable_background_io_activity', 'disable_cuda'," + " 'random_seed', 'objective_function', 'data_layout'," + " 'print_affinity', 'use_data_store', 'preload_data_store'," + " 'super_node', 'write_sample_list', 'ltfb_verbose'," + " 'ckpt_dir', 'index_list_train', 'index_list_test'," + " 'label_filename_train', 'label_filename_test'," + " 'share_testing_data_readers', 'image_dir', 'no_im_comm']." + ) + assert actual == expected + + +def test_bad_extra_lbann_flags_not_a_dict(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags='invalid_flag', + check_executable_existence=False) + assert False + except Exception as e: + actual = str(e) + expected = ( + 'Invalid Usage: extra_lbann_flags must be a dict e.g. `{flag :' + ' None, flag: 4}`. 
Use `None` if a flag has no value attached ' + 'to it.') + assert actual == expected diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 7110ddc9a67..27f15772254 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -1,22 +1,34 @@ +import collections.abc +import copy +import math +import os +import re +import sys +import numpy as np import pytest -import math, os, re - +import shutil +import subprocess +from filecmp import cmp def check_list(substrings, strings): errors = [] for string in strings: for substring in substrings: - if (string != None) and (substring in string): + if (string is not None) and (isinstance(string, str)) and (substring in string): errors.append('%s contains %s' % (string, substring)) return errors def get_command(cluster, executable, + # Allocation/Run Parameters num_nodes=None, + num_processes=None, partition=None, time_limit=None, - num_processes=None, + # LBANN Parameters + ckpt_dir=None, + disable_cuda=None, dir_name=None, data_filedir_default=None, data_filedir_train_default=None, @@ -27,6 +39,7 @@ def get_command(cluster, data_reader_path=None, data_reader_percent=None, exit_after_setup=False, + metadata=None, mini_batch_size=None, model_folder=None, model_name=None, @@ -35,39 +48,64 @@ def get_command(cluster, optimizer_name=None, optimizer_path=None, processes_per_model=None, - ckpt_dir=None, - output_file_name=None, + restart_dir=None, + extra_lbann_flags=None, + # Error/Output Redirect error_file_name=None, - return_tuple=False, + output_file_name=None, + # Misc. Parameters check_executable_existence=True, - skip_no_exe=True): + return_tuple=False, + skip_no_exe=True, + weekly=False): # Check parameters for black-listed characters like semi-colons that # would terminate the command and allow for an extra command blacklist = [';', '--'] - strings = [partition, dir_name, data_filedir_default, - data_filedir_train_default, - data_filename_train_default, data_filedir_test_default, - data_filename_test_default, data_reader_name, data_reader_path, - model_folder, model_name, model_path, optimizer_name, - optimizer_path, output_file_name, error_file_name] + strings = [ + cluster, executable, + # Allocation/Run Parameters + num_nodes, num_processes, partition, time_limit, + # LBANN Parameters + ckpt_dir, dir_name, data_filedir_default, data_filedir_train_default, + data_filename_train_default, data_filedir_test_default, + data_filename_test_default, data_reader_name, data_reader_path, + data_reader_percent, exit_after_setup, metadata, mini_batch_size, + model_folder, model_name, model_path, num_epochs, optimizer_name, + optimizer_path, processes_per_model, restart_dir, + # Error/Output Redirect + error_file_name, output_file_name, + # Misc. Parameters + check_executable_existence, return_tuple, skip_no_exe, weekly + ] + lbann_errors = [] + if extra_lbann_flags is not None: + if not isinstance(extra_lbann_flags, dict): + lbann_errors.append( + ('extra_lbann_flags must be a dict e.g. `{flag :' + ' None, flag: 4}`. Use `None` if a flag has no value attached ' + 'to it.')) + else: + strings += list(extra_lbann_flags.keys()) + strings += list(extra_lbann_flags.values()) invalid_character_errors = check_list(blacklist, strings) if invalid_character_errors != []: raise Exception('Invalid character(s): %s' % ' , '.join( invalid_character_errors)) - # Never give lbannusr an allocation for over 12 hours though. - strict_time_limit = 60*6 # 6 hours. 
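The new time-limit handling just below replaces this hard cap: when no limit is given, `weekly` picks between a 35-minute nightly default and the 6-hour maximum, and any requested limit is still capped at 6 hours. A standalone restatement of that logic (the helper name is illustrative, not part of tools.py):

    DEFAULT_TIME = 35   # minutes, nightly default
    MAX_TIME = 360      # minutes; never hold an allocation for more than 6 hours

    def resolve_time_limit(time_limit=None, weekly=False):
        # Mirrors the branch added to get_command().
        if time_limit is None:
            time_limit = MAX_TIME if weekly else DEFAULT_TIME
        return min(time_limit, MAX_TIME)

    assert resolve_time_limit() == 35
    assert resolve_time_limit(weekly=True) == 360
    assert resolve_time_limit(time_limit=1000) == 360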
- if time_limit > strict_time_limit: - time_limit = strict_time_limit - - # Check executable existence - if check_executable_existence: - process_executable_existence(executable, skip_no_exe) + DEFAULT_TIME = 35 + MAX_TIME = 360 # 6 hours. + if time_limit is None: + if weekly: + time_limit = MAX_TIME + else: + time_limit = DEFAULT_TIME + if time_limit > MAX_TIME: + time_limit = MAX_TIME # Determine scheduler - if cluster in ['catalyst', 'pascal', 'quartz', 'surface']: + if cluster in ['catalyst', 'corona', 'pascal']: scheduler = 'slurm' - elif cluster == 'ray': + elif cluster in ['lassen', 'ray']: scheduler = 'lsf' else: raise Exception('Unsupported Cluster: %s' % cluster) @@ -77,9 +115,9 @@ def get_command(cluster, if scheduler == 'slurm': # Create allocate command command_allocate = '' - # Allocate a node if we don't have one already - # Running the tests manually allows for already having a node allocated - if os.getenv('SLURM_JOB_NUM_NODES') == None: + # Allocate nodes only if we don't already have an allocation. + if os.getenv('SLURM_JOB_NUM_NODES') is None: + print('Allocating slurm nodes.') command_allocate = 'salloc' option_num_nodes = '' option_partition = '' @@ -91,8 +129,8 @@ def get_command(cluster, # maxnodes. option_num_nodes = ' --nodes=%d' % num_nodes if partition is not None: - # Surface does not have pdebug, so switch to pbatch - if (cluster in ['surface', 'pascal']) and \ + # If cluster doesn't have pdebug switch to pbatch. + if (cluster in ['pascal']) and \ (partition == 'pdebug'): partition = 'pbatch' # --partition => Request a specific partition for the resource @@ -106,12 +144,16 @@ def get_command(cluster, command_allocate = '%s%s%s%s' % ( command_allocate, option_num_nodes, option_partition, option_time_limit) + else: + print('slurm nodes already allocated.') # Create run command if command_allocate == '': - command_run = 'srun --mpibind=off' + space = '' else: - command_run = ' srun --mpibind=off' + space = ' ' + command_run = '{s}srun --mpibind=off --time={t}'.format( + s=space, t=time_limit) option_num_processes = '' if num_processes is not None: # --ntasks => Specify the number of tasks to run. @@ -122,24 +164,29 @@ def get_command(cluster, elif scheduler == 'lsf': # Create allocate command command_allocate = '' - # Allocate a node if we don't have one already - # Running the tests manually allows for already having a node allocated - if os.getenv('LSB_HOSTS') is None: + # Allocate nodes only if we don't already have an allocation. + if (os.getenv('LSB_HOSTS') is None) and (os.getenv('LSB_JOBID') is None): + print('Allocating lsf nodes.') command_allocate = 'bsub' - # x => Puts the host running your job into exclusive execution - # mode. - option_exclusive = ' -x' + option_exclusive = '' + if cluster != 'lassen': + # x => Puts the host running your job into exclusive execution + # mode. + option_exclusive = ' -x' # G=> For fairshare scheduling. Associates the job with the # specified group. option_group = ' -G guests' # Is => Submits an interactive job and creates a pseudo-terminal # with shell mode when the job starts. option_interactive = ' -Is' + option_num_nodes = '' option_num_processes = '' option_partition = '' option_processes_per_node = '' option_time_limit = '' - if num_processes is not None: + if cluster == 'lassen': + option_num_nodes = ' -nnodes {n}'.format(n=num_nodes) + elif num_processes is not None: # n => Submits a parallel job and specifies the number of # tasks in the job. 
option_num_processes = ' -n %d' % num_processes @@ -147,7 +194,7 @@ def get_command(cluster, # R => Runs the job on a host that meets the specified # resource requirements. option_processes_per_node = ' -R "span[ptile=%d]"' % int( - math.ceil(float(num_processes)/num_nodes)) + math.ceil(float(num_processes) / num_nodes)) if partition is not None: # q => Submits the job to one of the specified queues. option_partition = ' -q %s' % partition @@ -158,32 +205,70 @@ def get_command(cluster, time_limit = max_ray_time # W => Sets the runtime limit of the job. option_time_limit = ' -W %d' % time_limit - command_allocate = '%s%s%s%s%s%s%s%s' % ( + command_allocate = '%s%s%s%s%s%s%s%s%s' % ( command_allocate, option_exclusive, option_group, option_interactive, option_num_processes, option_partition, - option_processes_per_node, option_time_limit) + option_num_nodes, option_processes_per_node, option_time_limit) + else: + print('lsf nodes already allocated.') # Create run command if command_allocate == '': - command_run = 'mpirun' + space = '' else: - command_run = ' mpirun' + space = ' ' + if cluster == 'lassen': + # Cannot specify time limit for jsrun. + command_run = '{s}jsrun'.format(s=space) + else: + command_run = '{s}mpirun --timeout {t}'.format(s=space, t=time_limit*60) + option_bind = '' + option_cpu_per_resource = '' + option_gpu_per_resource = '' + option_launch_distribution = '' option_num_processes = '' option_processes_per_node = '' + option_resources_per_host = '' + option_tasks_per_resource = '' if num_processes is not None: - # -np => Run this many copies of the program on the given nodes. - option_num_processes = ' -np %d' % num_processes - if (num_nodes is not None) and (num_nodes != 0): - option_processes_per_node = ' -N %d' % int( - math.ceil(float(num_processes)/num_nodes)) - command_run = '%s%s%s' % ( - command_run, option_num_processes, option_processes_per_node) + if cluster == 'lassen': + option_bind = ' -b "packed:10"' + option_cpu_per_resource = ' -c 40' + option_gpu_per_resource = ' -g 4' + option_launch_distribution = ' -d packed' + # Avoid `nrs (32) should not be greater than rs_per_host (1) * number of servers available (16).` + if num_nodes is None: + num_nodes = 1 + # The "option_num_processes" is a misnomer for the LSF case. Rather than + # changing the rest of the code, set it to be the number of nodes. Within + # JSRUN, the correct number of processes will be obtained when combined + # with "option_tasks_per_resource". + option_num_processes = ' -n {n}'.format(n=num_nodes) + option_resources_per_host = ' -r 1' + option_tasks_per_resource = ' -a %d' % (num_processes/num_nodes) + if (num_processes%num_nodes) is not 0: + raise Exception('num_processes %s, is not divisible by num_nodes %d' + % (num_processes, num_nodes)) + + else: + # -np => Run this many copies of the program on the given nodes. 
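On Lassen, jsrun is given one resource set per node (`-r 1`), `-n` set to the node count, and `-a` set to the ranks per node, so `num_processes` must divide evenly by `num_nodes`. A small standalone sketch of that calculation (hypothetical helper, not part of tools.py); note that integer equality is conventionally tested with `!=` rather than `is not`:

    def jsrun_tasks_per_node(num_processes, num_nodes):
        # One resource set per node, so each must hold an equal share of ranks.
        tasks, remainder = divmod(num_processes, num_nodes)
        if remainder != 0:
            raise ValueError('num_processes %d is not divisible by num_nodes %d'
                             % (num_processes, num_nodes))
        return tasks

    # e.g. 40 ranks spread over 4 nodes -> '-r 1 -a 10'
    assert jsrun_tasks_per_node(40, 4) == 10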
+ option_num_processes = ' -np %d' % num_processes + if (num_nodes is not None) and (num_nodes != 0): + processes_per_node = int( + math.ceil(float(num_processes)/num_nodes)) + option_processes_per_node = ' -N %d' % processes_per_node + command_run = '%s%s%s%s%s%s%s%s%s' % ( + command_run, option_bind, option_cpu_per_resource, + option_gpu_per_resource, option_launch_distribution, + option_num_processes, option_processes_per_node, + option_resources_per_host, option_tasks_per_resource) else: raise Exception('Unsupported Scheduler %s' % scheduler) # Create LBANN command option_ckpt_dir = '' + option_disable_cuda = '' option_data_filedir = '' option_data_filedir_train = '' option_data_filename_train = '' @@ -192,12 +277,13 @@ def get_command(cluster, option_data_reader = '' option_data_reader_percent = '' option_exit_after_setup = '' + option_metadata = '' option_mini_batch_size = '' option_model = '' option_num_epochs = '' option_optimizer = '' option_processes_per_model = '' - lbann_errors = [] + option_restart_dir = '' if model_path is not None: # If model_folder and/or model_name are set, an exception will be # raised later. @@ -206,8 +292,8 @@ def get_command(cluster, # If data_reader_name is set, an exception will be raised later. option_data_reader = ' --reader=%s' % data_reader_path if optimizer_path is not None: - # If optimizer_name is set, an exception will be raised later. - option_optimizer_name = ' --optimizer=%s' % optimizer_path + # If optimizer_name is also set, an exception will be raised later. + option_optimizer = ' --optimizer=%s' % optimizer_path if dir_name is not None: if model_path is not None: if (model_folder is not None) or (model_name is not None): @@ -251,27 +337,40 @@ def get_command(cluster, # Determine data file paths # If there is no regex match, then re.sub keeps the original string if data_filedir_default is not None: - if cluster in ['catalyst', 'pascal', 'surface']: + if cluster in ['catalyst', 'corona', 'pascal',]: # option_data_filedir = data_filedir_default # lscratchh, presumably pass # No need to pass in a parameter - elif cluster == 'quartz': + elif cluster == 'lassen': option_data_filedir = ' --data_filedir=%s' % re.sub( - '[a-z]scratch[a-z]', 'lscratchh', data_filedir_default) + '[a-z]scratch[a-z]', 'gpfs1', data_filedir_default) elif cluster == 'ray': option_data_filedir = ' --data_filedir=%s' % re.sub( '[a-z]scratch[a-z]', 'gscratchr', data_filedir_default) - elif None not in data_file_parameters: - if cluster in ['catalyst', 'pascal', 'surface']: + elif not data_file_parameters == [None, None, None, None]: + # Any of the data_file_parameters has a non-None value. 
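The cluster-specific path handling below rewrites the default `*scratch*` file-system prefix with a single regex substitution; a quick illustration using the default value from the command tests above (the pattern and replacements are taken from the code, the asserts are just examples):

    import re

    # On Lassen the default LC scratch prefix becomes gpfs1; on Ray it becomes
    # gscratchr. A path with no scratch prefix is passed through unchanged.
    assert re.sub('[a-z]scratch[a-z]', 'gpfs1',
                  'lscratchh/filedir') == 'gpfs1/filedir'
    assert re.sub('[a-z]scratch[a-z]', 'gscratchr',
                  'lscratchh/filedir') == 'gscratchr/filedir'
    assert re.sub('[a-z]scratch[a-z]', 'gpfs1',
                  '/p/gpfs1/already/there') == '/p/gpfs1/already/there'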
+ if cluster in ['catalyst', 'corona', 'pascal']: # option_data_filedir_train = data_filedir_train_default # option_data_filename_train = data_filename_train_default # option_data_filedir_test = data_filedir_test_default # option_data_filename_train = data_filename_test_default - pass # No need to pass in a parameter - elif cluster == 'quartz': - option_data_filedir_train = ' --data_filedir_train=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filedir_train_default) - option_data_filename_train = ' --data_filename_train=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filename_train_default) - option_data_filedir_test = ' --data_filedir_test=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filedir_test_default) - option_data_filename_train = ' --data_filename_test=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filename_test_default) + pass # No need to pass in a parameter + elif cluster == 'lassen': + if data_filedir_train_default is not None: + option_data_filedir_train = ' --data_filedir_train=%s' % re.sub('[a-z]scratch[a-z]', 'gpfs1', data_filedir_train_default) + if data_filename_train_default is not None: + filename_train = re.sub( + '[a-z]scratch[a-z]', 'gpfs1', data_filename_train_default) + filename_train = re.sub( + 'labels', 'original/labels', filename_train) + option_data_filename_train = ' --data_filename_train=%s' % filename_train + if data_filedir_test_default is not None: + option_data_filedir_test = ' --data_filedir_test=%s' % re.sub('[a-z]scratch[a-z]', 'gpfs1', data_filedir_test_default) + if data_filename_test_default is not None: + filename_test = re.sub( + '[a-z]scratch[a-z]', 'gpfs1', data_filename_test_default) + filename_test = re.sub( + 'labels', 'original/labels', filename_test) + option_data_filename_test = ' --data_filename_test=%s' % filename_test elif cluster == 'ray': option_data_filedir_train = ' --data_filedir_train=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filedir_train_default) option_data_filename_train = ' --data_filename_train=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filename_train_default) @@ -290,22 +389,23 @@ def get_command(cluster, else: # if None in data_file_parameters: # If any are None if data_file_parameters == [None, None, None, None]: # If all are None - lbann_errors.append( - ('data_reader_name or data_reader_path is set but not' - ' data_filedir_default. If a data reader is provided,' - ' the default filedir must be set. This allows for' - ' determining what the filedir should be on each' - ' cluster. Alternatively, some or all of' - ' [data_filedir_train_default, data_filename_train' - '_default, data_filedir_test_default, data_filename' - '_test_default] can be set.')) + if data_reader_name != 'synthetic': + lbann_errors.append( + ('data_reader_name or data_reader_path is set but not' + ' data_filedir_default. If a data reader is provided,' + ' the default filedir must be set. This allows for' + ' determining what the filedir should be on each' + ' cluster. 
Alternatively, some or all of' + ' [data_filedir_train_default, data_filename_train' + '_default, data_filedir_test_default, data_filename' + '_test_default] can be set.')) # else: no data_file parameters are set else: if data_filedir_default is not None: lbann_errors.append( ('data_filedir_default set but neither data_reader_name' ' or data_reader_path are.')) - elif filter(lambda x: x is not None, data_file_parameters) != []: + elif list(filter(lambda x: x is not None, data_file_parameters)) != []: # If the list of non-None data_file parameters is not empty lbann_errors.append( ('At least one of [data_filedir_train_default, data_filename' @@ -313,10 +413,30 @@ def get_command(cluster, '_test_default] is set, but neither data_reader_name or' ' data_reader_path are.')) # else: no conflicts - if data_reader_percent is not None: - option_data_reader_percent = ' --data_reader_percent=%f' % data_reader_percent + if data_reader_percent != "prototext": + if data_reader_percent is not None: + + # If data_reader_percent is not None, then it will override `weekly`. + # If it is None however, we choose its value based on `weekly`. + try: + data_reader_percent = float(data_reader_percent) + + except ValueError: + lbann_errors.append( + 'data_reader_percent={d} is not a float.'.format( + d=data_reader_percent)) + elif weekly: + data_reader_percent = 1.00 + else: + # Nightly + data_reader_percent = 0.10 + option_data_reader_percent = ' --data_reader_percent={d}'.format( + d=data_reader_percent) + # else: use the data reader's value if exit_after_setup: option_exit_after_setup = ' --exit_after_setup' + if metadata is not None: + option_metadata = ' --metadata={d}/{m}'.format(d=dir_name, m=metadata) if mini_batch_size is not None: option_mini_batch_size = ' --mini_batch_size=%d' % mini_batch_size if num_epochs is not None: @@ -325,17 +445,91 @@ def get_command(cluster, option_processes_per_model = ' --procs_per_model=%d' % processes_per_model if ckpt_dir is not None: option_ckpt_dir = ' --ckpt_dir=%s' % ckpt_dir + if restart_dir is not None: + option_restart_dir = ' --restart_dir=%s' % restart_dir + if disable_cuda is not None: + option_disable_cuda = ' --disable_cuda=%d' % int(bool(disable_cuda)) + extra_options = '' + if extra_lbann_flags is not None: + # If extra_lbann_flags is not a dict, then we have already appended + # this error to lbann_errors. + if isinstance(extra_lbann_flags, dict): + # See `lbann --help` or src/proto/proto_common.cpp + # Commented out flags already have their own parameters + # in this function. 
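Flags from `extra_lbann_flags` are validated against the allowed list below and then appended to the command line, with a value of `None` emitting a bare flag. A tiny illustration of the resulting option string (flag names taken from the allowed list; the values are made up):

    extra_lbann_flags = {'print_affinity': None, 'random_seed': 7}
    extra_options = ''
    for flag, value in sorted(extra_lbann_flags.items()):
        if value is not None:
            extra_options += ' --{f}={v}'.format(f=flag, v=value)
        else:
            extra_options += ' --{f}'.format(f=flag)
    assert extra_options == ' --print_affinity --random_seed=7'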
+ allowed_flags = [ + # 'model', + # 'optimizer', + # 'reader', + # 'metadata', + + # General: + # 'mini_batch_size', + # 'num_epochs', + 'hydrogen_block_size', + 'procs_per_trainer', + 'num_parallel_readers', + 'num_io_threads', + 'serialize_io', + 'disable_background_io_activity', + #'disable_cuda', + 'random_seed', + 'objective_function', + 'data_layout', + 'print_affinity', + 'use_data_store', + 'preload_data_store', + 'super_node', + 'write_sample_list', + 'ltfb_verbose', + 'ckpt_dir', + #'restart_dir', + 'restart_dir_is_fullpath', + + # DataReaders: + # 'data_filedir', + # 'data_filedir_train', + # 'data_filedir_test', + # 'data_filename_train', + # 'data_filename_test', + 'index_list_train', + 'index_list_test', + 'label_filename_train', + 'label_filename_test', + # 'data_reader_percent', + 'share_testing_data_readers', + + # Callbacks: + 'image_dir', + 'no_im_comm', + + # Not listed by `lbann --help`: + # 'exit_after_setup', + # 'procs_per_model' + ] + for flag, value in sorted(extra_lbann_flags.items()): + if flag in allowed_flags: + if value is not None: + extra_options += ' --{f}={v}'.format(f=flag, v=value) + else: + extra_options += ' --{f}'.format(f=flag) + else: + s = ('extra_lbann_flags includes invalid flag={f}.' + ' Flags must be in {flags}.').format( + f=flag, flags=allowed_flags) + lbann_errors.append(s) if lbann_errors != []: print('lbann_errors={lbann_errors}.'.format(lbann_errors=lbann_errors)) raise Exception('Invalid Usage: ' + ' , '.join(lbann_errors)) - command_lbann = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % ( - executable, option_ckpt_dir, option_data_filedir, + command_lbann = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % ( + executable, option_ckpt_dir, option_disable_cuda, + option_data_filedir, option_data_filedir_train, option_data_filename_train, option_data_filedir_test, option_data_filename_test, option_data_reader, option_data_reader_percent, - option_exit_after_setup, option_mini_batch_size, + option_exit_after_setup, option_metadata, option_mini_batch_size, option_model, option_num_epochs, option_optimizer, - option_processes_per_model) + option_processes_per_model, option_restart_dir, extra_options) # Create redirect command command_output = '' @@ -357,28 +551,27 @@ def get_command(cluster, return command_string -def process_executable_existence(executable, skip_no_exe=True): - executable_exists = os.path.exists(executable) - if not executable_exists: - error_string = 'Executable does not exist: %s' % executable - if skip_no_exe: - pytest.skip(error_string) - else: - raise Exception(error_string) +def process_executable(name, compiler_name, executables): + if compiler_name not in executables: + e = '{n}: default_exes[{c}] does not exist'.format( + n=name, c=compiler_name) + print('Skip - ' + e) + import pytest + pytest.skip(e) + executable_path = executables[compiler_name] + print('{n}: executable_path={e}'.format(n=name, e=executable_path)) def get_spack_exes(default_dirname, cluster): exes = {} - exes['clang4'] = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - exes['gcc4'] = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (default_dirname, cluster) + exes['clang6'] = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) exes['gcc7'] = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - exes['intel18'] = 
'%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) + exes['intel19'] = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - exes['clang4_debug'] = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) - exes['gcc4_debug'] = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_debug/build/model_zoo/lbann' % (default_dirname, cluster) + exes['clang6_debug'] = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) exes['gcc7_debug'] = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) - exes['intel18_debug'] = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) + exes['intel19_debug'] = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) return exes @@ -386,38 +579,411 @@ def get_spack_exes(default_dirname, cluster): def get_default_exes(default_dirname, cluster): exes = get_spack_exes(default_dirname, cluster) # Use build script as a backup if the Spack build doesn't work. - if not os.path.exists(exes['clang4']): - exes['clang4'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['clang6']): + exes['clang6'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) if not os.path.exists(exes['gcc7']): exes['gcc7'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if not os.path.exists(exes['intel18']): - exes['intel18'] = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['intel19']): + exes['intel19'] = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if not os.path.exists(exes['clang4_debug']): - exes['clang4_debug'] = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['clang6_debug']): + exes['clang6_debug'] = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) if not os.path.exists(exes['gcc7_debug']): exes['gcc7_debug'] = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if not os.path.exists(exes['intel18_debug']): - exes['intel18_debug'] = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['intel19_debug']): + exes['intel19_debug'] = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) default_exes = {} default_exes['default'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if cluster in ['catalyst', 'quartz', 'pascal']: - # x86_cpu - catalyst, quartz + if cluster in ['catalyst', 'corona', 'lassen', 'pascal', 'ray']: + # Define all compilers. 
+ # x86_cpu - catalyst # x86_gpu_pascal - pascal - default_exes['clang4'] = exes['clang4'] - default_exes['gcc4'] = exes['gcc4'] + # ppc64le_gpu_lassen - lassen + default_exes['clang6'] = exes['clang6'] default_exes['gcc7'] = exes['gcc7'] - default_exes['intel18'] = exes['intel18'] + default_exes['intel19'] = exes['intel19'] - default_exes['clang4_debug'] = exes['clang4_debug'] - default_exes['gcc4_debug'] = exes['gcc4_debug'] + default_exes['clang6_debug'] = exes['clang6_debug'] default_exes['gcc7_debug'] = exes['gcc7_debug'] - default_exes['intel18_debug'] = exes['intel18_debug'] - elif cluster in ['surface']: - # x86_gpu - surface - default_exes['gcc4'] = exes['gcc4'] - default_exes['gcc4_debug'] = exes['gcc4_debug'] + default_exes['intel19_debug'] = exes['intel19_debug'] + print('default_exes={d}'.format(d=default_exes)) return default_exes + + +def get_error_line(error_file_name): + with open(error_file_name, 'r') as error_file: + error_line = '' + previous_line = '' + for line in error_file: + if ('ERROR' in line) or ('LBANN error' in line) or \ + ('Error:' in line) or \ + ('Expired or invalid job' in line) or \ + ('Segmentation fault (core dumped)' in line) or \ + ('Relinquishing job allocation' in line): + error_line = line + break + elif ('Stack trace:' in line) or \ + ('Error is not recoverable: exiting now' in line): + error_line = previous_line + break + else: + previous_line = line + return error_line + + +def assert_success(return_code, error_file_name): + if return_code != 0: + error_line = get_error_line(error_file_name) + raise AssertionError( + 'return_code={rc}\n{el}\nSee {efn}'.format( + rc=return_code, el=error_line, efn=error_file_name)) + + +def assert_failure(return_code, expected_error, error_file_name): + if return_code == 0: + raise AssertionError( + 'return_code={rc}\nSuccess when expecting failure.\nSee {efn}'.format( + rc=return_code, efn=error_file_name)) + with open(error_file_name, 'r') as error_file: + for line in error_file: + if expected_error in line: + return True + # If we're at this point, then we know the test did not succeed, + # but we didn't get the expected error. + actual_error = get_error_line(error_file_name) + raise AssertionError( + 'return_code={rc}\nFailed with error different than expected.\nactual_error={ae}\nexpected_error={ee}\nSee {efn}'.format( + rc=return_code, ae=actual_error, ee=expected_error, + efn=error_file_name)) + + +def create_tests(setup_func, + test_file, + test_name_base=None, + **kwargs): + """Create functions that can interact with PyTest + + This function creates tests that involve running an LBANN + experiment with the Python frontend. `setup_func` should be a + function that takes in the LBANN Python module and outputs objects + for an LBANN experiment. A test succeeds if LBANN runs and exits + with an exit code of 0, and fails otherwise. + + PyTest detects tests by loading in a Python script and looking for + functions prefixed with 'test_'. After you call this function + within a script to generate test functions, make sure to add the + test functions to the script's scope. For example: + + _test_funcs = tools.create_tests(setup_func, __file__) + for t in _test_funcs: + globals()[t.__name__] = t + + Args: + setup_func (function): Sets up an LBANN experiment using the + Python frontend. It takes in the LBANN Python module as + input and returns a `(lbann.Trainer, lbann.Model, + lbann.reader_pb2.DataReader, lbann.Optimizer)`. + test_file (str): Python script being run by PyTest. In most + cases, use `__file__`. 
+ test_name (str, optional): Descriptive name (default: test + file name with '.py' removed). + **kwargs: Keyword arguments to pass into + `lbann.contrib.launcher.run`. + + Returns: + Iterable of function: Tests that can interact with PyTest. + Each function returns a dict containing log files and + other output data. + + """ + + # Make sure test name is valid + test_file = os.path.realpath(test_file) + if not test_name_base: + # Create test name by removing '.py' from file name + test_name_base = os.path.splitext(os.path.basename(test_file))[0] + if not re.match('^test_.', test_name_base): + # Make sure test name is prefixed with 'test_' + test_name_base = 'test_' + test_name_base + + def test_func(cluster, executables, dir_name, compiler_name): + """Function that can interact with PyTest. + + Returns a dict containing log files and other output data. + + """ + process_executable(test_name_base, compiler_name, executables) + test_name = '{}_{}'.format(test_name_base, compiler_name) + + # Load LBANN Python frontend + build_names = { + 'clang6': 'clang.Release.{}.llnl.gov'.format(cluster), + 'clang6_debug': 'clang.Debug.{}.llnl.gov'.format(cluster), + 'gcc7': 'gnu.Release.{}.llnl.gov'.format(cluster), + 'gcc7_debug': 'gnu.Debug.{}.llnl.gov'.format(cluster), + 'intel19': 'intel.Release.{}.llnl.gov'.format(cluster), + 'intel19_debug': 'intel.Debug.{}.llnl.gov'.format(cluster), + } + python_frontend_path = os.path.join(dir_name, + 'build', + build_names[compiler_name], + 'install', + 'lib', + 'python3.7', + 'site-packages') + sys.path.append(python_frontend_path) + import lbann + import lbann.contrib.launcher + + # Setup LBANN experiment + trainer, model, data_reader, optimizer = setup_func(lbann) + + # Configure kwargs to LBANN launcher + _kwargs = copy.deepcopy(kwargs) + if 'work_dir' not in _kwargs: + _kwargs['work_dir'] = os.path.join(os.path.dirname(test_file), + 'experiments', + test_name) + + # If the user provided a suffix for the work directory, append it + if 'work_subdir' in _kwargs: + _kwargs['work_dir'] = os.path.join(_kwargs['work_dir'], _kwargs['work_subdir']) + del _kwargs['work_subdir'] + + # Delete the work directory + if os.path.isdir(_kwargs['work_dir']): + shutil.rmtree(_kwargs['work_dir']) + + if 'job_name' not in _kwargs: + _kwargs['job_name'] = f'lbann_{test_name}' + if 'overwrite_script' not in _kwargs: + _kwargs['overwrite_script'] = True + + # Run LBANN + work_dir = _kwargs['work_dir'] + stdout_log_file = os.path.join(work_dir, 'out.log') + stderr_log_file = os.path.join(work_dir, 'err.log') + return_code = lbann.contrib.launcher.run( + trainer=trainer, + model=model, + data_reader=data_reader, + optimizer=optimizer, + **_kwargs, + ) + assert_success(return_code, stderr_log_file) + return { + 'return_code': return_code, + 'work_dir': work_dir, + 'stdout_log_file': stdout_log_file, + 'stderr_log_file': stderr_log_file, + } + + # Specific test functions for different build configurations + def test_func_clang6(cluster, exes, dirname): + return test_func(cluster, exes, dirname, 'clang6') + def test_func_gcc7(cluster, exes, dirname): + return test_func(cluster, exes, dirname, 'gcc7') + def test_func_intel19(cluster, exes, dirname): + return test_func(cluster, exes, dirname, 'intel19') + test_func_clang6.__name__ = '{}_clang6'.format(test_name_base) + test_func_gcc7.__name__ = '{}_gcc7'.format(test_name_base) + test_func_intel19.__name__ = '{}_intel19'.format(test_name_base) + + return ( + test_func_gcc7, + test_func_clang6, + test_func_intel19, + ) + + +def 
create_python_data_reader(lbann, + file_name, + sample_function_name, + num_samples_function_name, + sample_dims_function_name, + execution_mode): + """Create protobuf message for Python data reader + + A Python data reader gets data by importing a Python module and + calling functions in its scope. + + Args: + lbann (module): Module for LBANN Python frontend. + file_name (str): Python file. + sample_function_name (str): Function to get a data sample. It + takes one integer argument for the sample index and + returns an `Iterator` of `float`s. + sample_dims_function_name (str): Function to get dimensions of + a data sample. It takes no arguments and returns a + `(int,)`. + num_samples_function_name (str): Function to get number of + data samples in data set. It takes no arguments and + returns an `int`. + execution_mode (str): 'train', 'validation', or 'test' + + """ + + # Extract paths + file_name = os.path.realpath(file_name) + dir_name = os.path.dirname(file_name) + module_name = os.path.splitext(os.path.basename(file_name))[0] + + # Construct protobuf message for data reader + reader = lbann.reader_pb2.Reader() + reader.name = 'python' + reader.role = execution_mode + reader.shuffle = False + reader.percent_of_data_to_use = 1.0 + reader.python.module = module_name + reader.python.module_dir = dir_name + reader.python.sample_function = sample_function_name + reader.python.num_samples_function = num_samples_function_name + reader.python.sample_dims_function = sample_dims_function_name + + return reader + + +def numpy_l2norm2(x): + """Square of L2 norm, computed with NumPy + + The computation is performed with 64-bit floats. + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + x = x.reshape(-1) + return np.inner(x, x) + + +def make_iterable(obj): + """Convert to an iterable object + + Simply returns `obj` if it is alredy iterable. Otherwise returns a + 1-tuple containing `obj`. + + """ + if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + return obj + else: + return (obj,) + + +def str_list(it): + """Convert an iterable object to a space-separated string""" + return ' '.join([str(i) for i in make_iterable(it)]) + +# Define evaluation function +def collect_metrics_from_log_func(log_file, key): + metrics = [] + with open(log_file) as f: + for line in f: + match = re.search(key + ' : ([0-9.]+)', line) + if match: + metrics.append(float(match.group(1))) + return metrics + +def compare_metrics(baseline_metrics, test_metrics): + assert len(baseline_metrics) == len(test_metrics), \ + 'baseline and test experiments did not run for same number of epochs' + for i in range(len(baseline_metrics)): + x = baseline_metrics[i] + xhat = test_metrics[i] + assert x == xhat, \ + 'found discrepancy in metrics for baseline {b} and test {t}'.format(b=x, t=xhat) + + +# Perform a diff across a directoy where not all of the subdirectories will exist in +# the test directory. Return a list of unchecked subdirectories, the running error code +# and the list of failed directories +def multidir_diff(baseline, test, fileList): + tmpList = [] + err_msg = "" + err = 0 + # Iterate over the list of filepaths & remove each file. 
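For the small helpers defined above: `numpy_l2norm2` flattens its argument, promotes it to float64, and returns the squared L2 norm, while `str_list` joins an iterable into a space-separated string and treats bare strings as scalars. A quick check (values chosen purely for illustration):

    import numpy as np
    import tools

    x = np.array([[3.0], [4.0]], dtype=np.float32)
    # ||(3, 4)||^2 = 9 + 16 = 25, computed in double precision.
    assert tools.numpy_l2norm2(x) == 25.0

    assert tools.str_list([1, 2, 3]) == '1 2 3'
    assert tools.str_list('abc') == 'abc'  # strings are not iterated element-wise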
+ for filePath in fileList: + d = os.path.basename(filePath) + t = os.path.basename(os.path.dirname(filePath)) + c = os.path.join(test, t, d) + if os.path.exists(c): + ret = subprocess.run('diff -rq {baseline} {test}'.format( + baseline=filePath, test=c), capture_output=True, shell=True, text=True) + if ret.returncode != 0: + err_msg += 'diff -rq {baseline} {test} failed {dt}\n'.format( + dt=ret.returncode, baseline=filePath, test=c) + err_msg += ret.stdout + err += ret.returncode + else: + tmpList.append(filePath) + + return tmpList, err, err_msg + +# Perform a line by line difference of an xml file and look for any floating point values +# For each floating point value, check to see if it is close-enough and log a warning if it +# is within a threshhold. +def approx_diff_xml_files(file1, file2, rel_tol): + f1 = open(file1, 'r') + f2 = open(file2, 'r') + files_differ = False + diff_list = [] + near_diff_list = [] + for l1 in f1: + l2 = next(f2) + if l1 != l2: + try: + v1 = float(re.sub(r'\s*<\w*>(\S*)<\/\w*>\s*', r'\1', l1)) + v2 = float(re.sub(r'\s*<\w*>(\S*)<\/\w*>\s*', r'\1', l2)) + close = math.isclose(v1, v2, rel_tol=rel_tol, abs_tol=0.0) + if not close: + err = ('lines: %s and %s differ: %.13f != %.13f (+/- %.1e)' % (l1.rstrip(), l2.rstrip(), v1, v2, rel_tol)) + diff_list.append(err) + files_differ = True + else: + warn = ('lines: %s and %s are close: %.13f ~= %.13f (+/- %.1e)' % (l1.rstrip(), l2.rstrip(), v1, v2, rel_tol)) + near_diff_list.append(warn) + except ValueError: + # Non-numerical diff. + err = ('lines: %s and %s differ' % (l1.rstrip(), l2.rstrip())) + diff_list.append(err) + files_differ = True + return files_differ, diff_list, near_diff_list + +# Given a recursive python diff from dircmp, perform a recursive exploration of any files +# with differences. 
+# Given a recursive python diff from dircmp, perform a recursive exploration of any files
+# with differences. For files with differences, check any XML files for approximate
+# equivalence, since small differences can appear in some of the recorded floating-point values.
+def print_diff_files(dcmp):
+    any_files_differ = False
+    all_diffs = []
+    all_warns = []
+    for name in dcmp.diff_files:
+        from pprint import pprint
+        err = f'Files {os.path.join(dcmp.left, name)} and {os.path.join(dcmp.right, name)} differ'
+        if re.search('.xml', name):
+            files_differ, diff_list, warn_list = approx_diff_xml_files(
+                os.path.join(dcmp.left, name), os.path.join(dcmp.right, name), 1e-6)
+            if files_differ:
+                any_files_differ = True
+                all_diffs.append(err)
+                for d in diff_list:
+                    all_diffs.append(d)
+            if len(warn_list) > 0:
+                warn = f'Files {os.path.join(dcmp.left, name)} and {os.path.join(dcmp.right, name)} have a near difference'
+                all_warns.append(warn)
+                for w in warn_list:
+                    all_warns.append(w)
+        else:
+            any_files_differ = True
+            all_diffs.append(err)
+
+    for sub_dcmp in dcmp.subdirs.values():
+        files_differ, diff_list, warn_list = print_diff_files(sub_dcmp)
+        if files_differ:
+            any_files_differ = True
+        for d in diff_list:
+            all_diffs.append(d)
+        for d in warn_list:
+            all_warns.append(d)
+
+    return any_files_differ, all_diffs, all_warns
diff --git a/bamboo/compiler_tests/build_script.sh b/bamboo/compiler_tests/build_script.sh
index 07a19172f26..1ecdc393b57 100755
--- a/bamboo/compiler_tests/build_script.sh
+++ b/bamboo/compiler_tests/build_script.sh
@@ -1,7 +1,131 @@
-CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g')
-if [ "${CLUSTER}" != 'surface' ]; then
-    source /usr/share/lmod/lmod/init/bash
-    source /etc/profile.d/00-modulepath.sh
-fi
+#!/bin/bash
+
+source /usr/share/lmod/lmod/init/bash
+source /etc/profile.d/00-modulepath.sh
+
 LBANN_DIR=$(git rev-parse --show-toplevel)
-${LBANN_DIR}/scripts/build_lbann_lc.sh --with-conduit
+CLUSTER=$(hostname | sed 's/[0-9]*//g')
+USER=$(whoami)
+WORKSPACE_DIR=$(ls --color=no -d /usr/workspace/ws*/${USER})
+COMMON_DEPENDENCY_DIR=${WORKSPACE_DIR}/stable_dependencies
+DEPENDENCY_DIR_BASE=${COMMON_DEPENDENCY_DIR}/${CLUSTER}
+
+# For this script, we only care about GCC.
+LATEST_GCC=$(ls -1 ${DEPENDENCY_DIR_BASE} | grep gcc | tail -n1)
+COMPILER_DIR=${DEPENDENCY_DIR_BASE}/${LATEST_GCC}
+
+# For now, there's only one MPI library. The pipe to tail ensures that
+# we just pick one thing, just in case.
+MPI_LIBRARY=$(ls -1 --color=no ${COMPILER_DIR} | tail -n1)
+MPI_DIR=${COMPILER_DIR}/${MPI_LIBRARY}
+
+# All the dependencies are installed at the MPI level (even though
+# most are MPI-independent).
+DEPENDENCY_DIR=${MPI_DIR}
+
+export CMAKE_PREFIX_PATH=${COMMON_DEPENDENCY_DIR}/catch2:${COMMON_DEPENDENCY_DIR}/cereal:${COMMON_DEPENDENCY_DIR}/clara:${COMMON_DEPENDENCY_DIR}/cub:${COMMON_DEPENDENCY_DIR}/half:${DEPENDENCY_DIR}/aluminum:${DEPENDENCY_DIR}/cnpy:${DEPENDENCY_DIR}/conduit:${DEPENDENCY_DIR}/hdf5:${DEPENDENCY_DIR}/hydrogen:${DEPENDENCY_DIR}/jpeg-turbo:${DEPENDENCY_DIR}/nccl:${DEPENDENCY_DIR}/openblas:${DEPENDENCY_DIR}/opencv:${DEPENDENCY_DIR}/protobuf:${CMAKE_PREFIX_PATH}
+
+if [ -e ${DEPENDENCY_DIR} ];
+then
+    SAVELIST_NAME=$(echo ${CLUSTER}_${LATEST_GCC}_${MPI_LIBRARY} | sed -e 's/\./x/g')
+
+    if ml -t savelist |& grep ${SAVELIST_NAME} > /dev/null 2>&1
+    then
+        ml restore ${SAVELIST_NAME}
+    else
+        # Compilers are easy...
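+        # Turn a directory name like gcc-7.1.0 into the corresponding module name, e.g. gcc/7.1.0.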
+ COMPILER_MODULE=$(echo ${LATEST_GCC} | sed -e 's|-|/|g') + + if [[ ${MPI_LIBRARY} =~ ^spectrum-mpi-.*$ ]] + then + MPI_MODULE=$(echo ${MPI_LIBRARY} | sed -e 's|spectrum-mpi-|spectrum-mpi/|g') + else + MPI_MODULE=$(echo ${MPI_LIBRARY} | sed -e 's|-|/|g') + fi + + # Use the latest CUDA 10, since it's compatible with other + # CUDA 10.* libraries + CUDA_MODULE=$(ml --terse avail cuda |& sed -n '/\/10\./p' | tail -n1) + + # Load up the appropriate modules + module load ${COMPILER_MODULE} ${MPI_MODULE} ${CUDA_MODULE} cmake/3.14.5 + ml save ${SAVELIST_NAME} + fi + + BRAIN_DIR=/usr/workspace/wsb/brain + + # CUDA-y things (Use the newest) + ARCH=$(uname -i) + export NCCL_DIR=$(ls -d --color=no ${BRAIN_DIR}/nccl2/*cuda10*${ARCH} | tail -n1) + # Right now, we only support cuDNN 7 versions. + export CUDNN_DIR=$(find ${BRAIN_DIR}/cudnn -maxdepth 2 -type d | grep "cudnn-7.*/cuda-10.*_${ARCH}" | sort -r | head -1) + + # Unit testing framework + export CLARA_DIR=${WORKSPACE_DIR}/stable_dependencies/clara + export CATCH2_DIR=${WORKSPACE_DIR}/stable_dependencies/catch2 + + # Add Ninja support + export PATH=${DEPENDENCY_DIR_BASE}/ninja/bin:${PATH} + + # Setup paths to match the build_lbann_lc.sh script (ugh) + BUILD_DIR_BASE=${LBANN_DIR}/build/gnu.Release.${CLUSTER}.llnl.gov + BUILD_DIR=${BUILD_DIR_BASE}/lbann/build + INSTALL_DIR=${BUILD_DIR_BASE}/install + + # Setup a path for Catch2 to use + CATCH2_OUTPUT_DIR=${LBANN_DIR}/bamboo/compiler_tests + rm -f ${CATCH2_OUTPUT_DIR}/*.xml + + # Decide if CUDA should be used. + if [[ "${CLUSTER}" =~ ^(pascal|lassen|ray)$ ]]; + then + USE_CUDA=ON + else + USE_CUDA=OFF + fi + + # Cleanup + [[ -e ${BUILD_DIR_BASE} ]] && rm -rf ${BUILD_DIR_BASE} + mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR} + + # Hack to be nice to others. + if [[ "${CLUSTER}" =~ ^(lassen|ray)$ ]]; + then + LAUNCH_CMD="lrun -1" + NHOSTS=$(expr $(printenv LSB_HOSTS | wc -w) - 1) + NNODES=$(expr ${NHOSTS} / 40) + PARALLEL_LAUNCH_CMD="jsrun -n${NNODES} -r1 -a4 -c40 -g4 -d packed -b packed:10 " + else + unset LAUNCH_CMD + PARALLEL_LAUNCH_CMD="srun --mpibind=off -N${SLURM_NNODES} --ntasks-per-node=2 " + fi + + cmake \ + -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} \ + \ + -DCMAKE_CXX_COMPILER=$(which g++) \ + -DCMAKE_CXX_FLAGS="-DLBANN_SET_EL_RNG -g" \ + -DCMAKE_CUDA_COMPILER=$(which nvcc) \ + -DCMAKE_CUDA_HOST_COMPILER=$(which g++) \ + \ + -DCMAKE_CXX_STANDARD=14 \ + -DCMAKE_CUDA_STANDARD=14 \ + \ + -DLBANN_DATATYPE=float \ + -DLBANN_DETERMINISTIC=ON \ + -DLBANN_WARNINGS_AS_ERRORS=ON \ + -DLBANN_WITH_CONDUIT=ON \ + -DLBANN_WITH_CUDA=ON \ + -DLBANN_WITH_NVPROF=OFF \ + -DLBANN_WITH_TBINF=ON \ + -DLBANN_WITH_UNIT_TESTING=ON \ + -DLBANN_WITH_VTUNE=OFF \ + \ + -Dprotobuf_MODULE_COMPATIBLE=ON \ + \ + ${LBANN_DIR} && ${LAUNCH_CMD} ninja && ${LAUNCH_CMD} ninja install && ${LAUNCH_CMD} ./unit_test/seq-catch-tests -r junit -o ${CATCH2_OUTPUT_DIR}/seq_catch_tests_output-${CLUSTER}.xml ; ${PARALLEL_LAUNCH_CMD} ./unit_test/mpi-catch-tests -r junit -o "${CATCH2_OUTPUT_DIR}/mpi_catch_tests_output-${CLUSTER}-rank=%r-size=%s.xml" +else + ${LBANN_DIR}/scripts/build_lbann_lc.sh --with-conduit +fi diff --git a/bamboo/compiler_tests/build_script_specific.sh b/bamboo/compiler_tests/build_script_specific.sh index 975d58ac4a1..49833de8b1e 100755 --- a/bamboo/compiler_tests/build_script_specific.sh +++ b/bamboo/compiler_tests/build_script_specific.sh @@ -2,10 +2,8 @@ set -e CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') LBANN_DIR=$(git rev-parse --show-toplevel) DEBUG='' -if [ 
"${CLUSTER}" != 'surface' ]; then - source /usr/share/lmod/lmod/init/bash - source /etc/profile.d/00-modulepath.sh -fi +source /usr/share/lmod/lmod/init/bash +source /etc/profile.d/00-modulepath.sh while :; do case ${1} in @@ -32,22 +30,18 @@ while :; do shift done -if [ "${COMPILER}" == 'clang4' ]; then - module load clang/4.0.0 +if [ "${COMPILER}" == 'clang6' ]; then + module load clang/6.0.0 ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler clang ${DEBUG} --reconfigure --with-conduit fi -if [ "${COMPILER}" == 'intel18' ]; then - module load intel/18.0.0 - ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler intel ${DEBUG} --reconfigure --with-conduit -fi - -if [ "${COMPILER}" == 'gcc4' ]; then - module load gcc/4.9.3 - ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler gnu ${DEBUG} --reconfigure --with-conduit -fi if [ "${COMPILER}" == 'gcc7' ]; then module load gcc/7.1.0 ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler gnu ${DEBUG} --reconfigure --with-conduit fi + +if [ "${COMPILER}" == 'intel19' ]; then + module load intel/19.0.0 + ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler intel ${DEBUG} --reconfigure --with-conduit +fi diff --git a/bamboo/compiler_tests/builds/.gitignore b/bamboo/compiler_tests/builds/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/compiler_tests/builds/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/compiler_tests/builds/README.md b/bamboo/compiler_tests/builds/README.md deleted file mode 100644 index 1962c6506d6..00000000000 --- a/bamboo/compiler_tests/builds/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for build directories diff --git a/bamboo/compiler_tests/conftest.py b/bamboo/compiler_tests/conftest.py index 238b812e638..ccffb182a73 100644 --- a/bamboo/compiler_tests/conftest.py +++ b/bamboo/compiler_tests/conftest.py @@ -4,13 +4,13 @@ def pytest_addoption(parser): cluster = re.sub('[0-9]+', '', subprocess.check_output( - 'hostname'.split()).strip()) + 'hostname'.split()).decode('utf-8').strip()) default_dirname = subprocess.check_output( - 'git rev-parse --show-toplevel'.split()).strip() + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. 
Default the current cluster') parser.addoption('--dirname', action='store', default=default_dirname, - help='--dirname specifies the top-level directory') + help='--dirname= specifies the top-level directory') @pytest.fixture diff --git a/bamboo/compiler_tests/error/.gitignore b/bamboo/compiler_tests/error/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/compiler_tests/error/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/compiler_tests/error/README.md b/bamboo/compiler_tests/error/README.md deleted file mode 100644 index 78712c2962b..00000000000 --- a/bamboo/compiler_tests/error/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test error diff --git a/bamboo/compiler_tests/output/.gitignore b/bamboo/compiler_tests/output/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/compiler_tests/output/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/compiler_tests/output/README.md b/bamboo/compiler_tests/output/README.md deleted file mode 100644 index 308358e3777..00000000000 --- a/bamboo/compiler_tests/output/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test output diff --git a/bamboo/compiler_tests/test_compiler.py b/bamboo/compiler_tests/test_compiler.py index 5682d11f3af..212dcf7f8cc 100644 --- a/bamboo/compiler_tests/test_compiler.py +++ b/bamboo/compiler_tests/test_compiler.py @@ -1,81 +1,41 @@ -# import sys -# sys.path.insert(0, '../common_python') -# import tools +import sys +sys.path.insert(0, '../common_python') +import tools import pytest import os, re, subprocess def test_compiler_build_script(cluster, dirname): - if cluster in ['pascal']: - output_file_name = '%s/bamboo/compiler_tests/output/build_script_output.txt' % (dirname) - error_file_name = '%s/bamboo/compiler_tests/error/build_script_error.txt' % (dirname) - command = '%s/bamboo/compiler_tests/build_script.sh > %s 2> %s' % ( - dirname, output_file_name, error_file_name) - return_code = os.system(command) - if return_code != 0: - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - assert return_code == 0 - else: + if cluster not in ['catalyst', 'corona', 'lassen', 'pascal', 'ray']: e = 'test_compiler_build_script: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) + output_file_name = '%s/bamboo/compiler_tests/output/build_script_output.txt' % (dirname) + error_file_name = '%s/bamboo/compiler_tests/error/build_script_error.txt' % (dirname) + command = '%s/bamboo/compiler_tests/build_script.sh > %s 2> %s' % ( + dirname, output_file_name, error_file_name) + return_code = os.system(command) + tools.assert_success(return_code, error_file_name) -def test_compiler_clang4_release(cluster, dirname): - try: - skeleton_clang4(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'clang4', False) - path = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) +def test_compiler_clang6_release(cluster, dirname): + skeleton_clang6(cluster, dirname, False) + path = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def test_compiler_clang4_debug(cluster, 
dirname): - try: - skeleton_clang4(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'clang4', True) - path = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) +def test_compiler_clang6_debug(cluster, dirname): + skeleton_clang6(cluster, dirname, True) + path = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def test_compiler_gcc4_release(cluster, dirname): - try: - skeleton_gcc4(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc4', False) - path = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (dirname, cluster) - assert os.path.exists(path) - - -def test_compiler_gcc4_debug(cluster, dirname): - try: - skeleton_gcc4(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc4', True) - path = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_debug/build/model_zoo/lbann' % (dirname, cluster) - assert os.path.exists(path) - - def test_compiler_gcc7_release(cluster, dirname): - try: - skeleton_gcc7(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc7', False) + skeleton_gcc7(cluster, dirname, False) path = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_rel/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) @@ -83,87 +43,69 @@ def test_compiler_gcc7_release(cluster, dirname): def test_compiler_gcc7_debug(cluster, dirname): - try: - skeleton_gcc7(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc7', True) + skeleton_gcc7(cluster, dirname, True) path = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_debug/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def test_compiler_intel18_release(cluster, dirname): - try: - skeleton_intel18(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'intel18', False) - path = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) +def test_compiler_intel19_release(cluster, dirname): + skeleton_intel19(cluster, dirname, False) + path = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def test_compiler_intel18_debug(cluster, dirname): - try: - skeleton_intel18(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'intel18', True) - path = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) +def test_compiler_intel19_debug(cluster, dirname): + skeleton_intel19(cluster, dirname, True) + path = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def skeleton_clang4(cluster, dir_name, debug, 
should_log=False): - if cluster in ['catalyst', 'quartz']: - spack_skeleton(dir_name, 'clang@4.0.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'clang@4.0.0', debug, should_log) - else: - e = 'skeleton_clang4: Unsupported Cluster %s' % cluster - print('Skip - ' + e) - pytest.skip(e) - - -def skeleton_gcc4(cluster, dir_name, debug, should_log=False): - if cluster in ['quartz']: # Taking out 'catalyst' - mpi = 'mvapich2@2.2' - elif cluster in ['surface']: # Taking out 'pascal' - mpi = 'mvapich2@2.2+cuda' - elif cluster == 'ray': - mpi = 'spectrum-mpi@2018.04.27' - else: - e = 'skeleton_gcc4: Unsupported Cluster %s' % cluster +def skeleton_clang6(cluster, dir_name, debug): + if cluster not in []: + e = 'skeleton_clang6: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) - spack_skeleton(dir_name, 'gcc@4.9.3', mpi, debug, should_log) - build_skeleton(dir_name, 'gcc@4.9.3', debug, should_log) + try: + spack_skeleton(dir_name, 'clang@6.0.0', 'mvapich2@2.2', debug) + build_skeleton(dir_name, 'clang@6.0.0', debug) + except AssertionError as e: + print(e) + build_script(cluster, dir_name, 'clang6', debug) -def skeleton_gcc7(cluster, dir_name, debug, should_log=False): - if cluster in ['catalyst', 'quartz']: - spack_skeleton(dir_name, 'gcc@7.1.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'gcc@7.1.0', debug, should_log) - else: +def skeleton_gcc7(cluster, dir_name, debug): + if cluster not in []: e = 'skeleton_gcc7: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) + try: + spack_skeleton(dir_name, 'gcc@7.1.0', 'mvapich2@2.2', debug) + build_skeleton(dir_name, 'gcc@7.1.0', debug) + except AssertionError as e: + print(e) + build_script(cluster, dir_name, 'gcc7', debug) -def skeleton_intel18(cluster, dir_name, debug, should_log=False): - if cluster in ['quartz']: # Taking out 'catalyst' - spack_skeleton(dir_name, 'intel@18.0.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'intel@18.0.0', debug, should_log) - else: - e = 'skeleton_intel18: Unsupported Cluster %s' % cluster +def skeleton_intel19(cluster, dir_name, debug): + if cluster not in []: # Taking out 'catalyst' + e = 'skeleton_intel19: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) + try: + spack_skeleton(dir_name, 'intel@19.0.0', 'mvapich2@2.2', debug) + build_skeleton(dir_name, 'intel@19.0.0', debug) + except AssertionError as e: + print(e) + build_script(cluster, dir_name, 'intel19', debug) -def spack_skeleton(dir_name, compiler, mpi_lib, debug, should_log): +def spack_skeleton(dir_name, compiler, mpi_lib, debug): compiler_underscored = re.sub('[@\.]', '_', compiler) if debug: build_type = 'debug' @@ -179,17 +121,10 @@ def spack_skeleton(dir_name, compiler, mpi_lib, debug, should_log): dir_name, compiler, mpi_lib, debug_flag, output_file_name, error_file_name) return_code = os.system(command) os.chdir('..') - if should_log or (return_code != 0): - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) -def build_skeleton(dir_name, compiler, debug, should_log): +def build_skeleton(dir_name, compiler, debug): compiler_underscored = re.sub('[@\.]', '_', compiler) if debug: build_type = 'debug' @@ -199,31 +134,22 @@ def build_skeleton(dir_name, compiler, debug, should_log): 
error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_build_error.txt' % (dir_name, compiler_underscored, build_type) compiler = compiler.replace('@', '-') #mpi_lib = mpi_lib.replace('@', '-') - cluster = re.sub('[0-9]+', '', subprocess.check_output('hostname'.split()).strip()) + cluster = re.sub('[0-9]+', '', subprocess.check_output('hostname'.split()).decode('utf-8').strip()) # For reference: # Commenting out for now. These additions to path name will likely return # one day, so I am not removing them entirely. - # x86_64 <=> catalyst, pascal, quartz, surface + # x86_64 <=> catalyst, pascal # ppc64le <=> ray - #architecture = subprocess.check_output('uname -m'.split()).strip() + #architecture = subprocess.check_output('uname -m'.split()).decode('utf-8').strip() #if cluster == 'ray': # architecture += '_gpu_cuda-9.2.64_cudnn-7.0' #elif cluster == 'pascal': # architecture += '_gpu_cuda-9.1.85_cudnn-7.1' - #elif cluster == 'surface': - # architecture += '_gpu' os.chdir('%s/bamboo/compiler_tests/builds/%s_%s_%s/build' % (dir_name, cluster, compiler, build_type)) command = 'make -j all > %s 2> %s' % (output_file_name, error_file_name) return_code = os.system(command) os.chdir('../..') - if should_log or (return_code != 0): - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def build_script(cluster, dirname, compiler, debug): @@ -240,11 +166,4 @@ def build_script(cluster, dirname, compiler, debug): error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_%s_build_script_error.txt' % (dirname, cluster, compiler, build) command = '%s/bamboo/compiler_tests/build_script_specific.sh --compiler %s %s> %s 2> %s' % (dirname, compiler, debug_flag, output_file_name, error_file_name) return_code = os.system(command) - if return_code != 0: - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py deleted file mode 100644 index 0d0a4dda68e..00000000000 --- a/bamboo/integration_tests/common_code.py +++ /dev/null @@ -1,218 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import collections, csv, os, pprint, re, time - - -# Set up the command ########################################################## -def get_command(cluster, dir_name, model_folder, model_name, executable, - output_file_name, error_file_name, compiler_name, weekly=False): - if model_name in ['alexnet', 'conv_autoencoder_imagenet']: - data_reader_percent = 0.01 - if weekly: - data_reader_percent = 0.10 - command = tools.get_command( - cluster=cluster, executable=executable, num_nodes=16, - partition='pbatch', time_limit=600, num_processes=32, - dir_name=dir_name, - data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/', - data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt', - data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/', - data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt', - 
data_reader_name='imagenet', data_reader_percent=data_reader_percent, - model_folder=model_folder, model_name=model_name, num_epochs=20, - optimizer_name='adagrad', output_file_name=output_file_name, - error_file_name=error_file_name) - elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']: - if (model_name == 'lenet_mnist') and \ - (compiler_name in ['clang4', 'intel18']): - partition = 'pbatch' - time_limit = 600 - else: - partition = 'pdebug' - time_limit = 30 - if (cluster == 'ray') and (model_name == 'conv_autoencoder_mnist'): - num_processes = 20 - else: - num_processes = 2 - command = tools.get_command( - cluster=cluster, executable=executable, num_nodes=1, - partition=partition, time_limit=time_limit, - num_processes=num_processes, dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder=model_folder, - model_name=model_name, num_epochs=5, optimizer_name='adagrad', - output_file_name=output_file_name, error_file_name=error_file_name) - else: - raise Exception('Invalid model: %s' % model_name) - return command - -# Run LBANN ################################################################### - - -def run_lbann(command, model_name, output_file_name, error_file_name, - should_log=False): - print('About to run: %s' % command) - print('%s began waiting in the queue at ' % model_name + - time.strftime('%H:%M:%S', time.localtime())) - output_value = os.system(command) - print('%s finished at ' % model_name + - time.strftime('%H:%M:%S', time.localtime())) - lbann_exceptions = [] - timed_out = False - if should_log or (output_value != 0): - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - is_match = re.search( - 'This lbann_exception is about to be thrown:(.*)', line) - if is_match: - lbann_exceptions.append(is_match.group(1)) - is_match = re.search('CANCELLED AT (.*) DUE TO TIME LIMIT', line) - if is_match: - timed_out = True - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - is_match = re.search('LBANN error on (.*)', line) - if is_match: - lbann_exceptions.append(is_match.group(1)) - if output_value != 0: - error_string = ('Model %s crashed with output_value=%d, timed_out=%s,' - ' and lbann exceptions=%s. Command was: %s') % ( - model_name, output_value, str(timed_out), - str(collections.Counter(lbann_exceptions)), command) - raise Exception(error_string) - return output_value - -# Extract data from output #################################################### - - -def populate_data_dict_epoch(regex, line, data_field, data_fields, data_dict, - model_id): - is_match = re.search(regex, line) - if is_match and (data_field in data_fields): - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - epoch_id = is_match.group(1) - value = float(is_match.group(2)) - data_dict[data_field][model_id][epoch_id] = value - - -def populate_data_dict_overall(regex, line, data_field, data_fields, data_dict, - model_id): - is_match = re.search(regex, line) - if is_match and (data_field in data_fields): - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - value = float(is_match.group(1)) - data_dict[data_field][model_id]['overall'] = value - - -# data_dict[data_field][model_id][epoch_id] = float -# data_fields is the list or set of data we're interested in. 
-def extract_data(output_file_name, data_fields, should_log): - output_file = open(output_file_name, 'r') - data_dict = {} - for data_field in data_fields: - data_dict[data_field] = {} - - for line in output_file: - if should_log: - print('extract_data: %s: %s' % (output_file_name, line)) - - # Check if line is reporting model results - is_model = re.search('^Model ([0-9]+)', line) - if not is_model: - is_model = re.search('^model([0-9]+)', line) - if is_model: - print('extract_data: is_model={is_model}'.format(is_model=is_model)) - model_id = is_model.group(1) - - regex = 'training epoch ([0-9]+) objective function : ([0-9.]+)' - data_field = 'training_objective_function' - populate_data_dict_epoch(regex, line, data_field, data_fields, - data_dict, model_id) - - regex = 'training epoch ([0-9]+) run time : ([0-9.]+)' - data_field = 'training_run_time' - populate_data_dict_epoch(regex, line, data_field, data_fields, - data_dict, model_id) - - regex = 'training epoch ([0-9]+) mini-batch time statistics : ([0-9.]+)s mean, ([0-9.]+)s max, ([0-9.]+)s min, ([0-9.]+)s stdev' - is_match = re.search(regex, line) - if is_match: - print('extract_data: is_mini-batch time statistics={is_match}'.format( - is_match=is_match)) - epoch_id = is_match.group(1) - mean_value = float(is_match.group(2)) - max_value = float(is_match.group(3)) - min_value = float(is_match.group(4)) - stdev_value = float(is_match.group(5)) - data_field = 'training_mean' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: mean_value={mv}'.format(mv=mean_value)) - data_dict[data_field][model_id][epoch_id] = mean_value - data_field = 'training_max' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: max_value={mv}'.format(mv=max_value)) - data_dict[data_field][model_id][epoch_id] = max_value - data_field = 'training_min' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: min_value={mv}'.format(mv=min_value)) - data_dict[data_field][model_id][epoch_id] = min_value - data_field = 'training_stdev' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: stdev={sv}'.format(sv=stdev_value)) - data_dict[data_field][model_id][epoch_id] = stdev_value - - regex = 'test categorical accuracy : ([0-9.]+)' - data_field = 'test_accuracy' - populate_data_dict_overall(regex, line, data_field, data_fields, - data_dict, model_id) - output_file.close() - if should_log: - print('extract_data: Extracted Data below:') - pprint.pprint(data_dict) - return data_dict - -# Skeleton #################################################################### - - -def skeleton(cluster, dir_name, executable, model_folder, model_name, - data_fields, should_log, compiler_name=None, weekly=False): - if compiler_name is None: - output_file_name = '%s/bamboo/integration_tests/output/%s_output.txt' % (dir_name, model_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_error.txt' % (dir_name, model_name) - else: - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (dir_name, model_name, compiler_name) - command = get_command( - cluster, dir_name, model_folder, model_name, 
executable, - output_file_name, error_file_name, compiler_name, weekly=weekly) - run_lbann(command, model_name, output_file_name, - error_file_name, should_log) # Don't need return value - return extract_data(output_file_name, data_fields, should_log) - -# Misc. functions ############################################################ - - -# csv_dict[row_header][column_header] = float -def csv_to_dict(csv_path): - with open(csv_path, 'r') as csv_file: - reader = csv.reader(csv_file, skipinitialspace=True) - column_headers = reader.next() - values = {} - for row in reader: - row_header = row[0] - values[row_header] = dict( - zip(column_headers[1:], map(float, row[1:]))) - return values diff --git a/bamboo/integration_tests/conftest.py b/bamboo/integration_tests/conftest.py index da2ffc127be..9487cdf242e 100644 --- a/bamboo/integration_tests/conftest.py +++ b/bamboo/integration_tests/conftest.py @@ -6,9 +6,9 @@ def pytest_addoption(parser): cluster = re.sub('[0-9]+', '', subprocess.check_output( - 'hostname'.split()).strip()) + 'hostname'.split()).decode('utf-8').strip()) default_dirname = subprocess.check_output( - 'git rev-parse --show-toplevel'.split()).strip() + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() default_exes = tools.get_default_exes(default_dirname, cluster) parser.addoption('--cluster', action='store', default=cluster, @@ -17,12 +17,8 @@ def pytest_addoption(parser): help='--dirname= to specify the top-level directory. Default directory of build_lbann_lc executable') parser.addoption('--exes', action='store', default=default_exes, help='--exes={compiler_name: path}') - parser.addoption('--log', action='store', default=0, - help='--log=1 to keep trimmed accuracy files. Default (--log=0) removes files') parser.addoption('--weekly', action='store_true', default=False, - help='--weekly specifies that the test should ONLY be run weekly, not nightly') - # For local testing only - parser.addoption('--exe', action='store', help='--exe=') + help='--weekly specifies that the test should ONLY be run weekly, not nightly. 
Default False') @pytest.fixture @@ -30,11 +26,6 @@ def cluster(request): return request.config.getoption('--cluster') -@pytest.fixture -def debug(request): - return request.config.getoption('--debug') - - @pytest.fixture def dirname(request): return request.config.getoption('--dirname') @@ -48,8 +39,3 @@ def exes(request): @pytest.fixture def weekly(request): return request.config.getoption('--weekly') - - -@pytest.fixture -def exe(request): - return request.config.getoption('--exe') diff --git a/bamboo/integration_tests/error/README.md b/bamboo/integration_tests/error/README.md deleted file mode 100644 index 78712c2962b..00000000000 --- a/bamboo/integration_tests/error/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test error diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 003794fd557..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.675652, 0.608574 -1, 0.590008, 0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 -9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 80c12b2b0ed..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207480 -1, 0.194710 -2, 0.193224 -3, 0.192867 -4, 0.192758 diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv deleted file mode 100644 index 32551e8e70b..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 56.00, 1.20, 5.00, 0.80, 0.40, 0.00 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 88.00, 0.12, 0.40, 0.10, 0.09, 98.40 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 003794fd557..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.675652, 0.608574 -1, 0.590008, 
0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 -9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 8bcf25bb71d..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207514 -1, 0.194710 -2, 0.193221 -3, 0.192864 -4, 0.192755 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv deleted file mode 100644 index d3ac7caa6b4..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 57.00, 1.11, 4.80, 0.37, 1.20, 0.00 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 64.00, 0.10, 0.40, 0.08, 0.04, 98.92 diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 003794fd557..00000000000 --- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.675652, 0.608574 -1, 0.590008, 0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 -9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 8bcf25bb71d..00000000000 --- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207514 -1, 0.194710 -2, 0.193221 -3, 0.192864 -4, 0.192755 diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv deleted file mode 100644 index cca3451efd2..00000000000 --- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, 
training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 51.00, 1.20, 4.00, 0.50, 0.40, 0.17 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 9.00, 0.01, 6.00, 0.01, 0.40, 98.40 diff --git a/bamboo/integration_tests/experiments/.gitignore b/bamboo/integration_tests/experiments/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/integration_tests/experiments/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/integration_tests/full_alexnet.sh b/bamboo/integration_tests/full_alexnet.sh deleted file mode 100644 index ff1b5cf1c76..00000000000 --- a/bamboo/integration_tests/full_alexnet.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -module load mpifileutils - -# Clear SSDs -srun --wait=0 --clear-ssd hostname > /dev/null - -# Cache dataset -echo "Caching dataset..." -[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/train_resized.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar > /dev/null -[ -d /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train ] || \ - srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 -[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/val_resized.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar > /dev/null -[ -d /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val ] || \ - srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 -[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar > /dev/null -[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/train.txt ] || \ - srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 -wait -echo "Done caching dataset..." 
- -LBANN_DIR=$(git rev-parse --show-toplevel) -CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') - -# Experiment -srun --nodes=128 --ntasks-per-node=2 ${LBANN_DIR}/bamboo/compiler_tests/builds/catalyst_gcc-4.9.3_x86_64_mvapich2-2.2_openblas_rel/build/model_zoo/lbann --model=${LBANN_DIR}/model_zoo/models/alexnet/model_alexnet.prototext --optimizer=${LBANN_DIR}/model_zoo/optimizers/opt_sgd.prototext --reader=${LBANN_DIR}/model_zoo/data_readers/data_reader_imagenet.prototext --data_filedir_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/train/ --data_filename_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/train.txt --data_filedir_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/val/ --data_filename_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/val.txt diff --git a/bamboo/integration_tests/output/README.md b/bamboo/integration_tests/output/README.md deleted file mode 100644 index 308358e3777..00000000000 --- a/bamboo/integration_tests/output/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test output diff --git a/bamboo/integration_tests/test_integration_alexnet.py b/bamboo/integration_tests/test_integration_alexnet.py new file mode 100644 index 00000000000..576b2852204 --- /dev/null +++ b/bamboo/integration_tests/test_integration_alexnet.py @@ -0,0 +1,190 @@ +import functools +import operator +import os +import os.path +import re +import sys +import numpy as np +import google.protobuf.text_format +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools +import data.imagenet + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 5 +mini_batch_size = 256 +num_nodes = 4 +imagenet_fraction = 0.280994 # Train with 360K out of 1.28M samples + +# Top-5 classification accuracy (percent) +expected_train_accuracy_range = (9, 15) +expected_test_accuracy_range = (15, 24) + +# Average mini-batch time (in sec) for each LC system +# Note that run times are with LBANN_DETERMINISTIC set +# Commented out times are prior to thread safe RNGs +expected_mini_batch_times = { + 'pascal': 0.154, # 0.100, + 'lassen': 0.050, + 'ray': 0.075, +} + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + # Setup data reader + data_reader = data.imagenet.make_data_reader(lbann, num_classes=1000) + # We train on a subset of ImageNet + data_reader.reader[0].percent_of_data_to_use = imagenet_fraction + # Only evaluate on ImageNet validation set at end of training + data_reader.reader[1].role = 'test' + + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
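+    # lbann.models provides the reference AlexNet implementation used below.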
+ import lbann.models + + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.AlexNet(1000)(images) + probs = lbann.Softmax(x) + cross_entropy = lbann.CrossEntropy(probs, labels) + top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) + layers = list(lbann.traverse_layer_graph(x)) + + # Setup objective function + l2_reg_weights = set() + for l in layers: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) + obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(top5, name='top-5 accuracy', unit='%')] + + # Construct model + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. 
+ + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname, weekly): + + # Skip test with nightly builds and on CPU systems + if not weekly: + pytest.skip('only run {} with weekly builds'.format(test_name)) + if cluster in ('catalyst', 'corona'): + pytest.skip('only run {} on GPU systems'.format(test_name)) + + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + test_accuracy = None + mini_batch_times = [] + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ top-5 accuracy : ([0-9.]+)%', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('test top-5 accuracy : ([0-9.]+)%', line) + if match: + test_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_train_accuracy_range[0] + < train_accuracy + < expected_train_accuracy_range[1]), \ + 'train accuracy is outside expected range' + + # Check if testing accuracy is within expected range + assert (expected_test_accuracy_range[0] + < test_accuracy + < expected_test_accuracy_range[1]), \ + 'test accuracy is outside expected range' + + # Check if mini-batch time is within expected range + # Note: Skip first epoch since its runtime is usually an outlier + mini_batch_times = mini_batch_times[1:] + mini_batch_time = sum(mini_batch_times) / len(mini_batch_times) + assert (0.75 * expected_mini_batch_times[cluster] + < mini_batch_time + < 1.25 * expected_mini_batch_times[cluster]), \ + 'average mini-batch time is outside expected range' + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + nodes=num_nodes): + globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/integration_tests/test_integration_autoencoders.py b/bamboo/integration_tests/test_integration_autoencoders.py deleted file mode 100644 index 5f021ce6f53..00000000000 --- a/bamboo/integration_tests/test_integration_autoencoders.py +++ /dev/null @@ -1,102 +0,0 @@ -import pytest -import common_code - - -def error_if(f, f_symbol, data_field, actual_values, expected_values, - model_name, errors, all_values, frequency_str): - d = actual_values[data_field] - for model_id in sorted(d.keys()): - for epoch_id in sorted(d[model_id].keys()): - actual_value = d[model_id][epoch_id] - expected_value = expected_values[epoch_id][data_field + frequency_str] - - if actual_value is None: - errors.append('d[%s][%s] == None' % (model_id, epoch_id)) - if expected_value is None: - errors.append('d[%s]([%s] == None' % (model_id, epoch_id)) - - if f(actual_value, expected_value): - errors.append('%f %s %f %s Model %s Epoch %s %s' % ( - actual_value, f_symbol, expected_value, model_name, model_id, - epoch_id, data_field)) - all_values.append('%f %s Model %s Epoch %s %s' % ( - actual_value, model_name, model_id, epoch_id, data_field)) - - -def run_tests(actual_objective_functions, model_name, dir_name, cluster, - should_log, compiler_name, frequency_str=''): - expected_objective_functions = common_code.csv_to_dict( - '%s/bamboo/integration_tests/expected_values/%s/%s/expected_%s_objective_functions.csv' % 
(dir_name, cluster, compiler_name, model_name)) - errors = [] - all_values = [] - tolerance = 0.05 - # Are we within tolerance * expected_value? - outside_tolerance = lambda x, y: abs(x - y) > abs(tolerance * y) - error_if(outside_tolerance, '!=', 'training_objective_function', - actual_objective_functions, expected_objective_functions, - model_name, errors, all_values, frequency_str) - - print('Errors for: %s %s (%d)' % (model_name, compiler_name, len(errors))) - for error in errors: - print(error) - if should_log: - print('All values for: %s %s (%d)' % (model_name, compiler_name, - len(all_values))) - for value in all_values: - print(value) - assert errors == [] - -DATA_FIELDS = [ - 'training_objective_function' -] - - -def skeleton_autoencoder_imagenet(cluster, dir_name, executables, compiler_name, - weekly): - if cluster in ['surface', 'pascal']: - e = 'skeleton_autoencoder_imagenet: does not run on GPU' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_autoencoder_imagenet: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - model_folder = 'models/autoencoder_imagenet' - model_name = 'conv_autoencoder_imagenet' - should_log = False - actual_objective_functions = common_code.skeleton( - cluster, dir_name, executables[compiler_name], model_folder, model_name, - DATA_FIELDS, should_log, compiler_name=compiler_name, weekly=weekly) - frequency_str = '_nightly' - if weekly: - frequency_str = '_weekly' - run_tests(actual_objective_functions, model_name, dir_name, cluster, - should_log, compiler_name, frequency_str) - - -def test_integration_autoencoder_imagenet_clang4(cluster, dirname, exes, - weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'clang4', weekly) - - -def test_integration_autoencoder_imagenet_gcc4(cluster, dirname, exes, weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc4', weekly) - - -def test_integration_autoencoder_imagenet_gcc7(cluster, dirname, exes, weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc7', weekly) - - -def test_integration_autoencoder_imagenet_intel18(cluster, dirname, exes, - weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'intel18', weekly) - - -# Run with python -m pytest -s test_integration_autoencoder.py -k 'test_integration_autoencoder_imagenet_exe' --exe= -def test_integration_autoencoder_imagenet_exe(cluster, dirname, exe): - if exe is None: - e = 'test_integration_autoencoder_imagenet_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip() - exes = {'exe': exe} - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'exe', True) diff --git a/bamboo/integration_tests/test_integration_debug.py b/bamboo/integration_tests/test_integration_debug.py deleted file mode 100644 index c205dffb24c..00000000000 --- a/bamboo/integration_tests/test_integration_debug.py +++ /dev/null @@ -1,112 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import common_code - - -def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, - debug, should_log=False): - # If weekly or debug are true, then run the test. 
- if (not weekly) and (not debug): - e = 'skeleton_mnist_debug: Not doing weekly or debug testing' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_mnist_debug: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - model_name = 'lenet_mnist' - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - partition='pbatch', time_limit=100, dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='models/' + model_name, - model_name=model_name, num_epochs=5, optimizer_name='adagrad', - output_file_name=output_file_name, error_file_name=error_file_name) - output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name) - assert output_value == 0 - - -def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, - debug, should_log=False): - # If weekly or debug are true, then run the test. - if (not weekly) and (not debug): - e = 'skeleton_cifar_debug: Not doing weekly or debug testing' - print('Skip - ' + e) - pytest.skip(e) - if cluster == 'ray': - e = 'skeleton_cifar_debug: cifar not operational on Ray' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_cifar_debug: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - model_name = 'autoencoder_cifar10' - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - partition='pbatch', time_limit=100, dir_name=dir_name, - data_filename_train_default='/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin', - data_filename_test_default='/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin', - data_reader_name='cifar10', data_reader_percent=0.01, model_folder='models/' + model_name, - model_name='conv_' + model_name, num_epochs=5, optimizer_name='adagrad', - output_file_name=output_file_name, error_file_name=error_file_name) - output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name) - assert output_value == 0 - - -def test_integration_mnist_clang4_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug) - - -def test_integration_cifar_clang4_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug) - - -def test_integration_mnist_gcc4_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug) - - -def test_integration_cifar_gcc4_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug) - - -def test_integration_mnist_gcc7_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug) - - -def test_integration_cifar_gcc7_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, 
dirname, exes, 'gcc7_debug', weekly, debug) - - -def test_integration_mnist_intel18_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'intel18_debug', weekly, debug) - - -def test_integration_cifar_intel18_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, dirname, exes, 'intel18_debug', weekly, debug) - - -# Run with python -m pytest -s test_integration_debug.py -k 'test_integration_mnist_exe' --exe= -def test_integration_mnist_exe(cluster, dirname, exe): - if exe is None: - e = 'test_integration_mnist_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_mnist_debug(cluster, dirname, exes, 'exe', True, True) - - -# Run with python -m pytest -s test_integration_debug.py -k 'test_integration_cifar_exe' --exe= -def test_integration_cifar_exe(cluster, dirname, exe): - if exe == None: - e = 'test_integration_cifar_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_cifar_debug(cluster, dirname, exes, 'exe', True, True) diff --git a/bamboo/integration_tests/test_integration_lenet.py b/bamboo/integration_tests/test_integration_lenet.py new file mode 100644 index 00000000000..3abc4a02387 --- /dev/null +++ b/bamboo/integration_tests/test_integration_lenet.py @@ -0,0 +1,174 @@ +import functools +import operator +import os +import os.path +import re +import sys +import numpy as np +import google.protobuf.text_format +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools +import data.mnist + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 5 +mini_batch_size = 64 +num_nodes = 2 + +# Classification accuracy (percent) +expected_train_accuracy_range = (98.75, 99.25) +expected_test_accuracy_range = (98, 99) + +# Average mini-batch time (in sec) for each LC system +# Note that run times are with LBANN_DETERMINISTIC set +# Commented out times are prior to thread safe RNGs +expected_mini_batch_times = { + 'pascal': 0.0014, # 0.0013, + 'catalyst': 0.0073, # 0.0055, + 'lassen': 0.0022, + 'ray': 0.0025, + 'corona': 0.0117, # 0.0075, +} + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + + data_reader = data.mnist.make_data_reader(lbann) + # No validation set + data_reader.reader[0].validation_percent = 0 + + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
+ import lbann.models + + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.LeNet(10)(images) + probs = lbann.Softmax(x) + loss = lbann.CrossEntropy(probs, labels) + acc = lbann.CategoricalAccuracy(probs, labels) + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(acc, name='accuracy', unit='%')] + + # Construct model + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=metrics, + callbacks=callbacks) + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. + + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname): + + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + test_accuracy = None + mini_batch_times = [] + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ accuracy : ([0-9.]+)%', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('test accuracy : ([0-9.]+)%', line) + if match: + test_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_train_accuracy_range[0] + < train_accuracy + < expected_train_accuracy_range[1]), \ + 'train accuracy is outside expected range' + + # Check if testing accuracy is within expected range + assert (expected_test_accuracy_range[0] + < test_accuracy + < expected_test_accuracy_range[1]), \ + 'test accuracy is outside expected range' + + # Check if mini-batch time is within expected range + # Note: Skip first epoch since its runtime is usually an outlier + mini_batch_times = mini_batch_times[1:] + mini_batch_time = sum(mini_batch_times) / len(mini_batch_times) + assert (0.75 * expected_mini_batch_times[cluster] + < mini_batch_time + < 1.25 * expected_mini_batch_times[cluster]), \ + 'average mini-batch time is outside expected range' + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + nodes=num_nodes): + globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py deleted file mode 100644 index a171184ba5e..00000000000 --- 
a/bamboo/integration_tests/test_integration_performance.py +++ /dev/null @@ -1,252 +0,0 @@ -import pytest -import operator, os -import common_code - - -def error_if(f, f_symbol, data_field, actual_values, expected_values, - model_name, errors, all_values, frequency_str): - d = actual_values[data_field] - if f_symbol == '<': - # Every time a value is smaller, update archive_value - archive_value = float('inf') - elif f_symbol == '>': - # Every time a value is greater, update archive_value - archive_value = float('-inf') - else: - raise Exception('Invalid Function Symbol %s' % f_symbol) - for model_id in sorted(d.keys()): - for epoch_id in sorted(d[model_id].keys()): - actual_value = d[model_id][epoch_id] - expected_value = expected_values[model_name + frequency_str][data_field] - - if actual_value is None: - errors.append('actual_value: d[%s][%s] is None' % (model_id, epoch_id)) - else: - print('actual_value={av}'.format(av=actual_value)) - if expected_value is None: - errors.append( - 'expected_value: d[%s]([%s] is None' % (model_id, epoch_id)) - else: - print('expected_value={ev}'.format(ev=expected_value)) - - if (actual_value is not None) and (expected_value is not None): - if f(actual_value, expected_value): - errors.append('%f %s %f %s Model %s Epoch %s %s' % ( - actual_value, f_symbol, expected_value, model_name, model_id, - epoch_id, data_field)) - all_values.append('%f %s Model %s Epoch %s %s' % ( - actual_value, model_name, model_id, epoch_id, data_field)) - - if f(actual_value, archive_value): - archive_value = actual_value - else: - print('archiving: either actual_value or expected_value is None.') - return archive_value - - -def run_tests(actual_performance, model_name, dir_name, should_log, - compiler_name, cluster, frequency_str=''): - expected_performance = common_code.csv_to_dict( - '%s/bamboo/integration_tests/expected_values/%s/%s/expected_performance.csv' % (dir_name, cluster, compiler_name)) - errors = [] - all_values = [] - greater_than = operator.gt - less_than = operator.lt - max_run_time = error_if(greater_than, '>', 'training_run_time', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_mean = error_if(greater_than, '>', 'training_mean', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_max = error_if(greater_than, '>', 'training_max', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_min = error_if(greater_than, '>', 'training_min', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_stdev = error_if(greater_than, '>', 'training_stdev', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - min_accuracy = error_if(less_than, '<', 'test_accuracy', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - - archival_string = '%s, %f, %f, %f, %f, %f, %f\n' % ( - os.environ['bamboo_buildNumber'], max_run_time, max_mean, max_max, max_min, - max_stdev, min_accuracy) - print('archival_string: ' + archival_string) - if os.environ['LOGNAME'] == 'lbannusr': - key = 'bamboo_planKey' - if key in os.environ: - plan = os.environ[key] - if plan in ['LBANN-NIGHTD', 'LBANN-WD']: - archive_file = '/usr/workspace/wsb/lbannusr/archives/%s/%s/%s/performance_%s.txt' % (plan, cluster, compiler_name, model_name) - print('Archive file: ' + archive_file) - with open(archive_file, 'a') as archive: - print('Archiving to file.') - 
archive.write(archival_string) - else: - print('The plan %s does not have archiving activated' % plan) - else: - print('%s is not in os.environ' % key) - else: - print('os.environ["LOGNAME"]=%s' % os.environ['LOGNAME']) - - print('Errors for: %s %s (%d)' % (model_name, compiler_name, len(errors))) - for error in errors: - print(error) - if should_log: - print('All values for: %s %s (%d)' % ( - model_name, compiler_name, len(all_values))) - for value in all_values: - print(value) - assert errors == [] - -DATA_FIELDS = [ - 'training_run_time', - 'training_mean', - 'training_max', - 'training_min', - 'training_stdev', - 'test_accuracy' -] - - -def skeleton_performance_lenet_mnist(cluster, dir_name, executables, - compiler_name): - if compiler_name not in executables: - e = 'skeleton_performance_lenet_mnist: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - executable = executables[compiler_name] - model_name = 'lenet_mnist' - model_folder = 'models/' + model_name - should_log = True - actual_performance = common_code.skeleton( - cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, - should_log, compiler_name=compiler_name) - run_tests(actual_performance, model_name, dir_name, should_log, - compiler_name, cluster) - - -def skeleton_performance_alexnet(cluster, dir_name, executables, compiler_name, - weekly): - if compiler_name not in executables: - e = 'skeleton_performance_alexnet: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - executable = executables[compiler_name] - model_name = 'alexnet' - model_folder = 'models/' + model_name - should_log = True - actual_performance = common_code.skeleton( - cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, - should_log, compiler_name=compiler_name, weekly=weekly) - frequency_str = '_nightly' - if weekly: - frequency_str = '_weekly' - run_tests(actual_performance, model_name, dir_name, should_log, - compiler_name, cluster, frequency_str) - - -def skeleton_performance_full_alexnet(cluster, dir_name, executables, - compiler_name, weekly): - if not weekly: - e = 'skeleton_performance_full_alexnet: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_performance_full_alexnet: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - executable = executables[compiler_name] - if not os.path.exists(executable): - pytest.skip('Executable does not exist: %s' % executable) - model_name = 'full_alexnet' - should_log = True - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - if cluster in ['catalyst', 'surface']: - command = 'salloc %s/bamboo/integration_tests/%s.sh > %s' % (dir_name, model_name, output_file_name) - elif cluster == 'ray': - e = 'skeleton_performance_full_alexnet: Ray is unsupported for skeleton_performance_full_alexnet' - print('Skip - ' + e) - pytest.skip(e) - else: - raise Exception('Unsupported Cluster %s' % cluster) - common_code.run_lbann(command, model_name, output_file_name, error_file_name, - should_log) # Don't need return value - actual_performance = common_code.extract_data(output_file_name, DATA_FIELDS, - should_log) - run_tests(actual_performance, model_name, dir_name, should_log, compiler_name, - cluster) - - -def 
test_integration_performance_lenet_mnist_clang4(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'clang4') - - -def test_integration_performance_alexnet_clang4(cluster, dirname, exes, weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'clang4', weekly) - - -def test_integration_performance_full_alexnet_clang4(cluster, dirname, exes, - weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'clang4', weekly) - - -def test_integration_performance_lenet_mnist_gcc4(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'gcc4') - - -def test_integration_performance_alexnet_gcc4(cluster, dirname, exes, weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'gcc4', weekly) - - -def test_integration_performance_full_alexnet_gcc4(cluster, dirname, exes, weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc4', weekly) - - -def test_integration_performance_lenet_mnist_gcc7(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'gcc7') - - -def test_integration_performance_alexnet_gcc7(cluster, dirname, exes, weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'gcc7', weekly) - - -def test_integration_performance_full_alexnet_gcc7(cluster, dirname, exes, - weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc7', weekly) - - -def test_integration_performance_lenet_mnist_intel18(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'intel18') - - -def test_integration_performance_alexnet_intel18(cluster, dirname, exes, - weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'intel18', weekly) - - -def test_integration_performance_full_alexnet_intel18(cluster, dirname, exes, - weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'intel18', weekly) - - -# Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_lenet_mnist_exe' --exe= -def test_integration_performance_lenet_mnist_exe(cluster, dirname, exe): - if exe is None: - e = 'test_integration_performance_lenet_mnist_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'exe') - - -# Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_alexnet_exe' --exe= -def test_integration_performance_alexnet_exe(cluster, dirname, exe): - if exe is None: - e = 'stest_integration_performance_alexnet_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_performance_alexnet(cluster, dirname, exes, 'exe', True) - - -# Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_full_alexnet_exe' --exe= -def test_integration_performance_full_alexnet_exe(cluster, dirname, exe): - if exe is None: - e = 'test_integration_performance_full_alexnet_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_performance_full_alexnet(cluster, dirname, exes, 'exe', True) diff --git a/bamboo/integration_tests/test_integration_resnet50.py b/bamboo/integration_tests/test_integration_resnet50.py new file mode 100644 index 00000000000..360e3fb20e1 --- /dev/null +++ b/bamboo/integration_tests/test_integration_resnet50.py @@ -0,0 +1,188 @@ +import functools +import operator +import os +import os.path +import re +import sys +import numpy as np +import 
google.protobuf.text_format +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools +import data.imagenet + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 5 +mini_batch_size = 256 +num_nodes = 4 +imagenet_fraction = 0.280994 # Train with 360K out of 1.28M samples + +# Top-5 classification accuracy (percent) +expected_train_accuracy_range = (45, 50) +expected_test_accuracy_range = (40, 55) + +# Average mini-batch time (in sec) for each LC system +expected_mini_batch_times = { + 'pascal': 0.25, + 'lassen': 0.10, + 'ray': 0.15, +} + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + # Setup data reader + data_reader = data.imagenet.make_data_reader(lbann, num_classes=1000) + # We train on a subset of ImageNet + data_reader.reader[0].percent_of_data_to_use = imagenet_fraction + # Only evaluate on ImageNet validation set at end of training + data_reader.reader[1].role = 'test' + + optimizer = lbann.SGD(learn_rate=0.1, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.models + + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.ResNet50(1000, bn_statistics_group_size=-1)(images) + probs = lbann.Softmax(x) + cross_entropy = lbann.CrossEntropy(probs, labels) + top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) + layers = list(lbann.traverse_layer_graph(x)) + + # Setup objective function + l2_reg_weights = set() + for l in layers: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) + obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(top5, name='top-5 accuracy', unit='%')] + + # Construct model + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. 
+ + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. + + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname, weekly): + + # Skip test with nightly builds and on CPU systems + if not weekly: + pytest.skip('only run {} with weekly builds'.format(test_name)) + if cluster in ('catalyst', 'corona'): + pytest.skip('only run {} on GPU systems'.format(test_name)) + + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + test_accuracy = None + mini_batch_times = [] + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ top-5 accuracy : ([0-9.]+)%', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('test top-5 accuracy : ([0-9.]+)%', line) + if match: + test_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_train_accuracy_range[0] + < train_accuracy + < expected_train_accuracy_range[1]), \ + 'train accuracy is outside expected range' + + # Check if testing accuracy is within expected range + assert (expected_test_accuracy_range[0] + < test_accuracy + < expected_test_accuracy_range[1]), \ + 'test accuracy is outside expected range' + + # Check if mini-batch time is within expected range + # Note: Skip first epoch since its runtime is usually an outlier + mini_batch_times = mini_batch_times[1:] + mini_batch_time = sum(mini_batch_times) / len(mini_batch_times) + assert (0.75 * expected_mini_batch_times[cluster] + < mini_batch_time + < 1.25 * expected_mini_batch_times[cluster]), \ + 'average mini-batch time is outside expected range' + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + nodes=num_nodes): + globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/local_test.cmd b/bamboo/local_test.cmd new file mode 100644 index 00000000000..aa17ec3101b --- /dev/null +++ b/bamboo/local_test.cmd @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --nodes 16 +#SBATCH --partition pbatch +#SBATCH --time 1440 + +# Update "--time" above to increase/decrease allocation time. +# Update "executable" with your executable. +# Use "data-reader-percent" to specify data reader percent. Note that `data-reader-percent=1.0` means 100%, not 1%. +# Use "--integration-tests" to only run integration tests. +# Use "--unit-tests" to only run unit tests. +./local_test.sh --executable "../build/gnu.Release.pascal.llnl.gov/install/bin/lbann" --data-reader-percent 0.001 --unit-tests diff --git a/bamboo/local_test.sh b/bamboo/local_test.sh new file mode 100755 index 00000000000..051c1931c8c --- /dev/null +++ b/bamboo/local_test.sh @@ -0,0 +1,120 @@ +#!/bin/bash -l + +# Local testing (i.e. 
not with Bamboo) + +################################################################ +# Help message +################################################################ + +function help_message { + local SCRIPT=$(basename ${0}) + local N=$(tput sgr0) # Normal text + local C=$(tput setf 4) # Colored text + cat << EOF +Run integration and unit tests locally, outside Bamboo. +Usage: ./${SCRIPT} [options] +Options: + ${C}--help${N} Display this help message and exit. + ${C}--data-reader-percent${N} Specify data reader percent. Note that `data-reader-percent=1.0` means 100%, not 1%. + ${C}--executable${N} Specify executable to be used. Required field. + ${C}--integration-tests${N} Specify that only integration tests should be run. + ${C}--unit-tests${N} Specify that only unit tests should be run. +EOF +} + +################################################################ +# Parse command-line arguments +################################################################ + +DATA_READER_PERCENT=0.001 +EXECUTABLE= +INTEGRATION_TESTS=1 +UNIT_TESTS=1 +while :; do + case ${1} in + -h|--help) + # Help message + help_message + exit 0 + ;; + -d|--data-reader-percent) + # Set data reader percent. + # -n: check if string has non-zero length. + if [ -n "${2}" ]; then + DATA_READER_PERCENT=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + help_message + exit 1 + fi + ;; + -e|--executable) + # Set executable. + # -n: check if string has non-zero length. + if [ -n "${2}" ]; then + EXECUTABLE=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + help_message + exit 1 + fi + ;; + -i|--integration-tests) + # Run only integration tests + UNIT_TESTS=0 + ;; + -u|--unit-tests) + # Run only unit tests + INTEGRATION_TESTS=0 + ;; + -?*) + # Unknown option + echo "Unknown option (${1})" >&2 + exit 1 + ;; + *) + # Break loop if there are no more options + break + esac + shift +done + +# -z: check if string has zero length. +if [ -z ${EXECUTABLE} ]; then + echo "Executable must be set." + help_message + exit 1 +fi + +################################################################ +# Run tests +################################################################ + +# Assume user already has an executable (i.e. no need for compiler tests). +# Assume user already has 16 nodes allocated on a cluster. + +echo "EXECUTABLE=${EXECUTABLE}" +echo "INTEGRATION_TESTS=${INTEGRATION_TESTS}" +echo "UNIT_TESTS=${UNIT_TESTS}" +PYTHON=python3 + +echo "Task: Cleaning" +./clean.sh + +echo "Task: Integration Tests" +cd integration_tests +if [ ${INTEGRATION_TESTS} -ne 0 ]; then + $PYTHON -m pytest -s -vv --durations=0 --exe=${EXECUTABLE} +fi +cd .. + +echo "Task: Unit Tests" +cd unit_tests +if [ ${UNIT_TESTS} -ne 0 ]; then + $PYTHON -m pytest -s -vv --durations=0 --exe=${EXECUTABLE} --data-reader-percent=${DATA_READER_PERCENT} +fi +cd .. + +echo "Task: Finished" diff --git a/bamboo/run.sh b/bamboo/run.sh new file mode 100755 index 00000000000..aef17792f88 --- /dev/null +++ b/bamboo/run.sh @@ -0,0 +1,55 @@ +#!/bin/bash -l + +CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') + +echo "run.sh CLUSTER=" +echo $CLUSTER + +PYTHON=python3 + +WEEKLY=0 +while :; do + case ${1} in + --weekly) + # Run all tests. This is a weekly build. 
+ echo "Setting WEEKLY in run.sh" + WEEKLY=1 + ;; + -?*) + # Unknown option + echo "Unknown option (${1})" >&2 + exit 1 + ;; + *) + # Break loop if there are no more options + break + esac + shift +done + +echo "run.sh WEEKLY=" +echo $WEEKLY + +echo "Task: Cleaning" +./clean.sh + +echo "Task: Compiler Tests" +cd compiler_tests +$PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml +cd .. + +echo "Task: Integration Tests" +cd integration_tests +if [ ${WEEKLY} -ne 0 ]; then + $PYTHON -m pytest -s -vv -k "gcc7" --durations=0 --weekly --junitxml=results.xml +else + $PYTHON -m pytest -s -vv -k "gcc7" --durations=0 --junitxml=results.xml +fi +cd .. + +echo "Task: Unit Tests" +cd unit_tests +OMP_NUM_THREADS=10 $PYTHON -m pytest -s -vv -k "gcc7" --durations=0 --junitxml=results.xml +cd .. + +echo "Task: Finished" diff --git a/bamboo/unit_tests/.gitignore b/bamboo/unit_tests/.gitignore index 16d3c4dbbfe..0cc4de789bf 100644 --- a/bamboo/unit_tests/.gitignore +++ b/bamboo/unit_tests/.gitignore @@ -1 +1,2 @@ .cache +*.prototext diff --git a/bamboo/unit_tests/conftest.py b/bamboo/unit_tests/conftest.py index eda975da95a..cf646ad1e04 100644 --- a/bamboo/unit_tests/conftest.py +++ b/bamboo/unit_tests/conftest.py @@ -3,34 +3,54 @@ import tools import pytest, re, subprocess + def pytest_addoption(parser): cluster = re.sub('[0-9]+', '', subprocess.check_output( - 'hostname'.split()).strip()) + 'hostname'.split()).decode('utf-8').strip()) default_dirname = subprocess.check_output( - 'git rev-parse --show-toplevel'.split()).strip() + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() default_exes = tools.get_default_exes(default_dirname, cluster) parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. Default the current cluster') parser.addoption('--dirname', action='store', default=default_dirname, - help='--dirname specifies the top-level directory') + help='--dirname= specifies the top-level directory') parser.addoption('--exes', action='store', default=default_exes, help='--exes={compiler_name: path}') + parser.addoption('--weekly', action='store_true', default=False, + help='--weekly specifies that the test should ONLY be run weekly, not nightly. Default False') # For local testing only - parser.addoption('--exe', action='store', help='--exe=') + parser.addoption('--data-reader-percent', action='store', default=None, + help='--data-reader-percent=. Default None. 
Note that 1.0 is 100%.') + parser.addoption('--exe', action='store', + help='--exe=') + @pytest.fixture def cluster(request): return request.config.getoption('--cluster') + @pytest.fixture def dirname(request): return request.config.getoption('--dirname') + @pytest.fixture def exes(request): return request.config.getoption('--exes') + +@pytest.fixture +def weekly(request): + return request.config.getoption('--weekly') + + +@pytest.fixture +def data_reader_percent(request): + return request.config.getoption('--data-reader-percent') + + @pytest.fixture def exe(request): return request.config.getoption('--exe') diff --git a/bamboo/unit_tests/error/.gitignore b/bamboo/unit_tests/error/.gitignore index 7c9d611b592..d6b7ef32c84 100644 --- a/bamboo/unit_tests/error/.gitignore +++ b/bamboo/unit_tests/error/.gitignore @@ -1,3 +1,2 @@ * !.gitignore -!README.md diff --git a/bamboo/unit_tests/error/README.md b/bamboo/unit_tests/error/README.md deleted file mode 100644 index 78712c2962b..00000000000 --- a/bamboo/unit_tests/error/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test error diff --git a/bamboo/unit_tests/experiments/.gitignore b/bamboo/unit_tests/experiments/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/unit_tests/experiments/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/unit_tests/output/.gitignore b/bamboo/unit_tests/output/.gitignore index 7c9d611b592..d6b7ef32c84 100644 --- a/bamboo/unit_tests/output/.gitignore +++ b/bamboo/unit_tests/output/.gitignore @@ -1,3 +1,2 @@ * !.gitignore -!README.md diff --git a/bamboo/unit_tests/output/README.md b/bamboo/unit_tests/output/README.md deleted file mode 100644 index 308358e3777..00000000000 --- a/bamboo/unit_tests/output/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test output diff --git a/bamboo/unit_tests/prototext/data_reader_mnist.prototext b/bamboo/unit_tests/prototext/data_reader_mnist.prototext deleted file mode 100644 index 9d2e2663202..00000000000 --- a/bamboo/unit_tests/prototext/data_reader_mnist.prototext +++ /dev/null @@ -1,64 +0,0 @@ -data_reader { - reader { - name: "mnist" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "train-images-idx3-ubyte" - label_filename: "train-labels-idx1-ubyte" - validation_percent: 0.1 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - image_preprocessor { - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 - } - } - } - reader { - name: "mnist" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "t10k-images-idx3-ubyte" - label_filename: "t10k-labels-idx1-ubyte" - validation_percent: 1.0 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - image_preprocessor { - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 - } - } - } -} diff --git a/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext b/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext deleted file mode 100644 index 77a1c7ed256..00000000000 --- 
a/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext +++ /dev/null @@ -1,122 +0,0 @@ -model { - data_layout: "data_parallel" - mini_batch_size: 64 - block_size: 256 - num_epochs: 3 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "cross_entropy" } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { - layer_metric { - name: "categorical accuracy" - layer: "accuracy" - unit: "%" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - summary { - dir: "." - mat_interval: 25 - } - } - callback { - adaptive_learning_rate { - patience: 4 - amt: 0.1 - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - children: "image label" - data_layout: "data_parallel" - input {} - } - layer { - parents: "data" - name: "image" - data_layout: "data_parallel" - split {} - } - layer { - parents: "data" - name: "label" - data_layout: "data_parallel" - split {} - } - - layer { - parents: "image" - name: "ip1" - data_layout: "model_parallel" - fully_connected { - num_neurons: 500 - has_bias: true - } - } - - layer { - parents: "ip1" - name: "relu1" - data_layout: "model_parallel" - relu {} - } - - layer { - parents: "relu1" - name: "ip2" - data_layout: "model_parallel" - fully_connected { - num_neurons: 10 - has_bias: true - } - } - - layer { - parents: "ip2" - name: "prob" - data_layout: "data_parallel" - softmax {} - } - - layer { - parents: "prob label" - name: "cross_entropy" - data_layout: "data_parallel" - cross_entropy {} - } - - layer { - parents: "prob label" - name: "accuracy" - data_layout: "data_parallel" - categorical_accuracy {} - } - -} diff --git a/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext b/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext deleted file mode 100644 index c89c171566f..00000000000 --- a/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext +++ /dev/null @@ -1,138 +0,0 @@ -model { - data_layout: "data_parallel" - mini_batch_size: 64 - block_size: 256 - num_epochs: 3 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "cross_entropy" } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { - layer_metric { - name: "categorical accuracy" - layer: "accuracy" - unit: "%" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - summary { - dir: "." 
- mat_interval: 25 - } - } - callback { - adaptive_learning_rate { - patience: 4 - amt: 0.1 - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - children: "image label" - data_layout: "data_parallel" - input {} - } - layer { - parents: "data" - name: "image" - data_layout: "data_parallel" - split {} - } - layer { - parents: "data" - name: "label" - data_layout: "data_parallel" - split {} - } - - layer { - parents: "image" - name: "ip1" - data_layout: "model_parallel" - fully_connected { - num_neurons: 500 - has_bias: true - } - } - - layer { - parents: "ip1" - name: "relu1" - data_layout: "model_parallel" - relu {} - } - - layer { - parents: "relu1" - name: "ip3" - data_layout: "model_parallel" - fully_connected { - num_neurons: 500 - has_bias: true - } - } - - layer { - parents: "ip3" - name: "relu3" - data_layout: "model_parallel" - relu {} - } - layer { - parents: "relu3" - name: "ip2" - data_layout: "model_parallel" - fully_connected { - num_neurons: 10 - has_bias: true - } - } - - layer { - parents: "ip2" - name: "prob" - data_layout: "data_parallel" - softmax {} - } - - layer { - parents: "prob label" - name: "cross_entropy" - data_layout: "data_parallel" - cross_entropy {} - } - - layer { - parents: "prob label" - name: "accuracy" - data_layout: "data_parallel" - categorical_accuracy {} - } - -} diff --git a/bamboo/unit_tests/prototext/opt_sgd.prototext b/bamboo/unit_tests/prototext/opt_sgd.prototext deleted file mode 100644 index 8d066780476..00000000000 --- a/bamboo/unit_tests/prototext/opt_sgd.prototext +++ /dev/null @@ -1,7 +0,0 @@ -optimizer { - sgd { - learn_rate: 0.01 - momentum: 0.9 - nesterov: false - } -} diff --git a/bamboo/unit_tests/test_unit_callback_set_weights_value.py b/bamboo/unit_tests/test_unit_callback_set_weights_value.py new file mode 100644 index 00000000000..97ae2b72dad --- /dev/null +++ b/bamboo/unit_tests/test_unit_callback_set_weights_value.py @@ -0,0 +1,158 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20200526) +_samples = np.random.uniform(size=13).astype(np.float32) + +# Sample access functions +def get_sample(index): + return (_samples[index],) +def num_samples(): + return len(_samples) +def sample_dims(): + return (1,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=1) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # LBANN implementation + weights_values = np.random.uniform(size=num_samples()).astype(np.float32) + w = lbann.Weights(optimizer=None, + initializer=lbann.ConstantInitializer(value=1234.5)) + for step, val in enumerate(weights_values): + callbacks.append( + lbann.CallbackSetWeightsValue(weights=w.name, value=val, step=step) + ) + x_lbann = lbann.Identity(lbann.Input()) + x = x_lbann + y = lbann.WeightsLayer(weights=w, dims='1') + z = lbann.Multiply(x, y) + metrics.append(lbann.Metric(z, name='value')) + + # Numpy implementation of training + vals = [] + for step, val in enumerate(weights_values): + x = np.float64(get_sample(step)[0]) + y = np.float64(val) + z = x * y + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='train')) + + # Numpy implementation of testing + vals = [] + for i in range(num_samples()): + x = np.float64(get_sample(i)[0]) + y = np.float64(weights_values[-1]) + z = x * y + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Construct model + return lbann.Model(epochs=1, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +### @todo Run on >1 proc when https://github.com/LLNL/lbann/issues/1548 is resolved +for test in tools.create_tests(setup_experiment, __file__, procs_per_node=1, nodes=1): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_check_proto_models.py b/bamboo/unit_tests/test_unit_check_proto_models.py index 353fca3143a..2921931c86c 100644 --- a/bamboo/unit_tests/test_unit_check_proto_models.py +++ b/bamboo/unit_tests/test_unit_check_proto_models.py @@ -5,7 +5,8 @@ import os -def skeleton_models(cluster, dir_name, executables, compiler_name): +def skeleton_models(cluster, dir_name, executables, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_models: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -37,23 +38,6 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST' data_reader_path = '%s/model_zoo/models/gan/mnist/discriminator_data.prototext' % (dir_name) data_reader_name = None - elif 'triplet' in file_name: - # Disabling triplet test. - print('Skipping triplet tests.') - continue - data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/' - data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/train/train_list_8h.nfl.npz' - data_filedir_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/' - data_filename_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/val/val_list_8h.nfl.npz' - data_reader_path = '%s/model_zoo/models/siamese/triplet/data_reader_triplet.prototext' % (dir_name) - data_reader_name = None - elif 'siamese_alexnet' in file_name: - data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/' - data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt' - data_filedir_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/' - data_filename_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt' - data_reader_path = '%s/model_zoo/models/siamese/siamese_alexnet/data_reader_imagenet_patches.prototext' % (dir_name) - data_reader_name = None elif 'net' in file_name: data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/' data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt' @@ -65,6 +49,8 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): time_limit = 3 if 'resnet50' in file_name: node_count = 8 + if not weekly: + continue # This is too many nodes for nightly. 
elif 'cifar' in file_name: data_filename_train_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin' data_filename_test_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin' @@ -98,10 +84,11 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): data_filename_test_default=data_filename_test_default, data_reader_name=data_reader_name, data_reader_path=data_reader_path, + data_reader_percent=data_reader_percent, exit_after_setup=True, model_path=model_path, optimizer_name=opt, output_file_name=output_file_name, - error_file_name=error_file_name) + error_file_name=error_file_name, weekly=weekly) if os.system(cmd) != 0: print("Error detected in " + model_path) #defective_models.append(file_name) @@ -115,30 +102,29 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): print('Errors for: The following models exited with errors %s' % compiler_name) for model in defective_models: print(model) - assert num_defective == 0 - - -def test_unit_models_clang4(cluster, dirname, exes): - skeleton_models(cluster, dirname, exes, 'clang4') + if num_defective != 0: + raise AssertionError( + 'num_defective={nd}\nDefective models:\n{dms}'.format( + nd=num_defective, dms=defective_models)) -def test_unit_models_gcc4(cluster, dirname, exes): - skeleton_models(cluster, dirname, exes, 'gcc4') +def test_unit_models_clang6(cluster, dirname, exes, weekly, data_reader_percent): + skeleton_models(cluster, dirname, exes, 'clang6', weekly, data_reader_percent) -def test_unit_models_gcc7(cluster, dirname, exes): - skeleton_models(cluster, exes, dirname, 'gcc7') +def test_unit_models_gcc7(cluster, dirname, exes, weekly, data_reader_percent): + skeleton_models(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_models_intel18(cluster, dirname, exes): - skeleton_models(cluster, dirname, exes, 'intel18') +def test_unit_models_intel19(cluster, dirname, exes, weekly, data_reader_percent): + skeleton_models(cluster, dirname, exes, 'intel19', weekly, data_reader_percent) -# Run with python -m pytest -s test_unit_check_proto_models.py -k 'test_unit_models_exe' --exe= -def test_unit_models_exe(cluster, dirname, exe): +# Run with python3 -m pytest -s test_unit_check_proto_models.py -k 'test_unit_models_exe' --exe= +def test_unit_models_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_models_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe' : exe} - skeleton_models(cluster, dirname, exes, 'exe') + skeleton_models(cluster, dirname, exes, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index 25ea6614e3b..7c3a36028ae 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -3,46 +3,48 @@ import tools import pytest import os - +from filecmp import dircmp def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, - compiler_name): + compiler_name, weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_checkpoint_lenet_shared: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) pytest.skip(e) exe = executables[compiler_name] - + # Handle data + if data_reader_percent is None: + data_reader_percent = 0.01 # No checkpointing, printing weights to files. 
output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) + os.system('rm -rf ckpt_lenet_shared && mkdir ckpt_lenet_shared') + no_ckpt_dir = 'ckpt_lenet_shared/no_ckpt_{c}'.format(c=compiler_name) command = tools.get_command( cluster=cluster, executable=exe, num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=no_ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_nockpt = os.system(command) - if return_code_nockpt != 0: - sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') - sys.exit(1) - os.system('mv ckpt ckpt_baseline') + tools.assert_success(return_code_nockpt, error_file_name) # Run to checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_checkpoint_%s_error.txt' % (dir_name, compiler_name) + ckpt_dir = 'ckpt_lenet_shared/ckpt_{c}'.format(c=compiler_name) command = tools.get_command( cluster=cluster, executable=exe, num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=1, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_ckpt_1 = os.system(command) - if return_code_ckpt_1 != 0: - sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_ckpt_1, error_file_name) # Pick up from checkpoint, printing weights to files. 
output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_restart_%s_output.txt' % (dir_name, compiler_name) @@ -51,104 +53,153 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, cluster=cluster, executable=exe, num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_ckpt_2 = os.system(command) - if return_code_ckpt_2 != 0: - sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_ckpt_2, error_file_name) - diff_test = os.system('diff -rq ckpt ckpt_baseline') - os.system('rm -rf ckpt*') - assert diff_test == 0 + dcmp = dircmp(ckpt_dir, no_ckpt_dir) + fail, diffs, warns = tools.print_diff_files(dcmp) + for w in warns: + print(w) + + if fail: + print() + for d in diffs: + print(d) + path_prefix = '{d}/bamboo/unit_tests'.format(d=dir_name) + raise AssertionError( + 'Compare {ncd} and {cd} in {p}'.format( + ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix)) def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, - compiler_name): - if compiler_name not in executables: - e = 'skeleton_checkpoint_lenet_distributed: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - exe = executables[compiler_name] - - # No checkpointing, printing weights to files. - output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=exe, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code_nockpt = os.system(command) - if return_code_nockpt != 0: - sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') - sys.exit(1) - os.system('mv ckpt ckpt_baseline') - - # Run to checkpoint, printing weights to files. - output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_checkpoint_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_checkpoint_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=exe, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='lenet_mnist_dist_ckpt', num_epochs=1, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code_ckpt_1 = os.system(command) - if return_code_ckpt_1 != 0: - sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error') - sys.exit(1) - - # Pick up from checkpoint, printing weights to files. 
- output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_restart_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_restart_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=exe, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code_ckpt_2 = os.system(command) - if return_code_ckpt_2 != 0: - sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') - sys.exit(1) - - diff_test = os.system('diff -rq ckpt ckpt_baseline') - os.system('rm -rf ckpt*') - assert diff_test == 0 - - -def test_unit_checkpoint_lenet_clang4(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'clang4') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'clang4') - - -def test_unit_checkpoint_lenet_gcc4(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc4') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc4') - - -def test_unit_checkpoint_lenet_gcc7(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc7') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc7') - - -def test_unit_checkpoint_lenet_intel18(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'intel18') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'intel18') - - -# Run with python -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_exe' --exe= -def test_unit_checkpoint_lenet_exe(cluster, dirname, exe): + compiler_name, + weekly, data_reader_percent): + if compiler_name not in executables: + e = 'skeleton_checkpoint_lenet_distributed: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) + exe = executables[compiler_name] + # Handle data + if data_reader_percent is None: + data_reader_percent = 0.01 + + # No checkpointing, printing weights to files. + output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) + os.system('rm -rf ckpt_lenet_distributed && mkdir ckpt_lenet_distributed') + no_ckpt_dir = 'ckpt_lenet_distributed/no_ckpt_{c}'.format(c=compiler_name) + command = tools.get_command( + cluster=cluster, executable=exe, num_nodes=1, num_processes=2, + dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=no_ckpt_dir, model_folder='tests', + model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) + return_code_nockpt = os.system(command) + tools.assert_success(return_code_nockpt, error_file_name) + + # Run to checkpoint, printing weights to files. 
+ output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_checkpoint_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_checkpoint_%s_error.txt' % (dir_name, compiler_name) + ckpt_dir = 'ckpt_lenet_distributed/ckpt_{c}'.format(c=compiler_name) + command = tools.get_command( + cluster=cluster, executable=exe, num_nodes=1, num_processes=2, + dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=ckpt_dir, model_folder='tests', + model_name='lenet_mnist_dist_ckpt', num_epochs=1, optimizer_name='sgd', + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) + return_code_ckpt_1 = os.system(command) + tools.assert_success(return_code_ckpt_1, error_file_name) + + # Pick up from checkpoint, printing weights to files. + output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_restart_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_restart_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, executable=exe, num_nodes=1, num_processes=2, + dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', data_reader_percent=data_reader_percent, + ckpt_dir=ckpt_dir, model_folder='tests', + model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) + return_code_ckpt_2 = os.system(command) + tools.assert_success(return_code_ckpt_2, error_file_name) + + dcmp = dircmp(ckpt_dir, no_ckpt_dir) + fail, diffs, warns = tools.print_diff_files(dcmp) + for w in warns: + print(w) + + if fail: + print() + for d in diffs: + print(d) + path_prefix = '{d}/bamboo/unit_tests'.format(d=dir_name) + raise AssertionError( + 'Compare {ncd} and {cd} in {p}'.format( + ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix)) + + +def test_unit_checkpoint_lenet_shared_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) + + +def test_unit_checkpoint_lenet_distributed_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) + + +def test_unit_checkpoint_lenet_shared_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) + + +def test_unit_checkpoint_lenet_distributed_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) + + +def test_unit_checkpoint_lenet_shared_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) + + +def test_unit_checkpoint_lenet_distributed_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) + + +# Run with python3 -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_shared_exe' --exe= +def test_unit_checkpoint_lenet_shared_exe(cluster, dirname, exe, + weekly, 
data_reader_percent): + if exe is None: + e = 'test_unit_checkpoint_lenet_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) + + +# Run with python3 -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_distributed_exe' --exe= +def test_unit_checkpoint_lenet_distributed_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_checkpoint_lenet_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'exe') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'exe') + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_checkpoint_lenet.py b/bamboo/unit_tests/test_unit_checkpoint_lenet.py new file mode 100644 index 00000000000..b052056c654 --- /dev/null +++ b/bamboo/unit_tests/test_unit_checkpoint_lenet.py @@ -0,0 +1,261 @@ +import os.path +import re +import sys +import math +import numpy as np +import google.protobuf.text_format +import pytest +import glob + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 4 +num_ckpt_epochs = int(float(num_epochs)/2) +mini_batch_size = 64 +num_nodes = 1 +lenet_fraction = 0.1 +random_seed = 20191206 + +test_name_base='test_unit_checkpoint_lenet' +checkpoint_dir='ckpt' + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size, + random_seed=random_seed) + + # Checkpoint after every epoch + trainer.callbacks = [ + lbann.CallbackCheckpoint( + checkpoint_dir=checkpoint_dir, + checkpoint_epochs=1, + checkpoint_steps=845 + ) + ] + + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
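+    # lbann.models provides the prebuilt LeNet used below. It is imported
+    # inside the function rather than at module scope, presumably so this
+    # file can still be collected by PyTest when the LBANN Python package is
+    # not importable (see the TODO above).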
+ import lbann.models + + # Manually override the global count so that each model is named the same + lbann.models.LeNet.global_count = 0 + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.LeNet(10)(images) + probs = lbann.Softmax(x) + loss = lbann.CrossEntropy(probs, labels) + acc = lbann.CategoricalAccuracy(probs, labels) + + # Make sure all layers are on CPU + for layer in lbann.traverse_layer_graph(input_): + layer.device = 'cpu' + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(acc, name='accuracy', unit='%')] + + # Construct model + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.contrib.lc.paths + + # Load data readers from prototext + dirname = os.path.dirname + lbann_dir = dirname(dirname(dirname(os.path.realpath(__file__)))) + pb_file = os.path.join(lbann_dir, + 'model_zoo', + 'data_readers', + 'data_reader_mnist.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(pb_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set location of MNIST data + for reader in message.reader: + reader.data_filedir = lbann.contrib.lc.paths.mnist_dir() + reader.percent_of_data_to_use = lenet_fraction + + + # Validation set + message.reader[0].validation_percent = 0.1 + + return message + +# ============================================== +# Setup PyTest +# ============================================== + +def create_test_func(test_func): + """Augment test function to cascade multiple tests and parse results. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. 
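+
+    A minimal sketch of the late-binding pitfall described above (illustrative
+    only, not part of the test):
+
+        >>> funcs = [lambda: i for i in range(3)]
+        >>> [f() for f in funcs]
+        [2, 2, 2]
+
+    Defining the augmented function inside this factory binds `test_func` when
+    the factory is called, so each generated test keeps its own value.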
+ + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname, weekly): + + # Run LBANN experiment baseline + print('\n################################################################################') + print('Running baseline model') + print('################################################################################\n') + baseline_test_output = test_func(cluster, exes, dirname) + baseline_training_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + baseline_validation_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'validation objective function') + baseline_test_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'test objective function') + + # Run LBANN model to checkpoint + print('\n################################################################################') + print('Running initial model to checkpoint') + print('################################################################################\n') + test_func_checkpoint = tools.create_tests( + setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='checkpoint', + lbann_args=['--disable_cuda=True' + ' --num_epochs='+str(num_ckpt_epochs)], + ) + + checkpoint_test_output = test_func_checkpoint[0](cluster, exes, dirname) + checkpoint_training_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + checkpoint_validation_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'validation objective function') + checkpoint_test_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'test objective function') + + print('\n################################################################################') + print('Running restarted model to completion') + print('################################################################################\n') + test_func_restart = tools.create_tests( + setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='restart', + lbann_args=['--disable_cuda=True' + + ' --restart_dir=' + + os.path.join(checkpoint_test_output['work_dir'], checkpoint_dir) + + ' --num_epochs='+str(num_epochs)], + ) + + # Restart LBANN model and run to completion + restart_test_output = test_func_restart[0](cluster, exes, dirname) + restart_training_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + restart_validation_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'validation objective function') + restart_test_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'test objective function') + + print('\n################################################################################') + print('Comparing results of models') + print('################################################################################\n') + + # Check if metrics are same in baseline and test experiments + # Note: "Print statistics" callback will print up to 6 digits + # of metric values. 
+ + # Comparing training objective functions + tools.compare_metrics(baseline_training_metrics, checkpoint_training_metrics + restart_training_metrics) + # Comparing validation objective functions + tools.compare_metrics(baseline_validation_metrics, checkpoint_validation_metrics + restart_validation_metrics) + # Comparing test objective functions + tools.compare_metrics(baseline_test_metrics, restart_test_metrics) + + baseline_ckpt=os.path.join(baseline_test_output['work_dir'], checkpoint_dir) + checkpoint_ckpt=os.path.join(checkpoint_test_output['work_dir'], checkpoint_dir) + restart_ckpt=os.path.join(restart_test_output['work_dir'], checkpoint_dir) + + err = 0 + err_dirs = '' + fileList = glob.glob('{base}/trainer0/*'.format(base=baseline_ckpt)) + fileList, tmp_err, tmp_err_str = tools.multidir_diff(baseline_ckpt, restart_ckpt, fileList) + err += tmp_err + err_dirs += tmp_err_str + fileList, tmp_err, tmp_err_str = tools.multidir_diff(baseline_ckpt, checkpoint_ckpt, fileList) + err += tmp_err + err_dirs += tmp_err_str + + err_msg = "\nUnmatched checkpoints:\n" + for f in fileList: + err_msg += f + "\n" + assert len(fileList) == 0, \ + 'Extra checkpoint data in baseline directory: ' + err_msg + assert err == 0, err_dirs + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='baseline', + lbann_args=['--disable_cuda=True']): + globals()[_test_func.__name__] = create_test_func(_test_func) diff --git a/bamboo/unit_tests/test_unit_datareader_python.py b/bamboo/unit_tests/test_unit_datareader_python.py new file mode 100644 index 00000000000..618e5a8f77e --- /dev/null +++ b/bamboo/unit_tests/test_unit_datareader_python.py @@ -0,0 +1,130 @@ +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190708) +_num_samples = 29 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 4 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
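+
+    The model applies L2Norm2 to each input sample and uses
+    CallbackCheckMetric to verify the mean metric value against a NumPy
+    reference computed from the same samples.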
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Layer graph + x = lbann.Input() + y = lbann.L2Norm2(x) + layers = list(lbann.traverse_layer_graph(x)) + metric = lbann.Metric(y, name='obj') + callbacks = [] + + # Compute expected value with NumPy + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = tools.numpy_l2norm2(x) + vals.append(y) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=layers, + metrics=[metric], + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_datastore_imagenet.py b/bamboo/unit_tests/test_unit_datastore_imagenet.py new file mode 100644 index 00000000000..69519ed26fc --- /dev/null +++ b/bamboo/unit_tests/test_unit_datastore_imagenet.py @@ -0,0 +1,324 @@ +import os.path +import re +import sys +import math +import numpy as np +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 5 +mini_batch_size = 256 +num_nodes = 2 +imagenet_fraction = 0.0031971 # Train with 4096 out of 1.28M samples +validation_percent = 0.1 +random_seed = 20191206 + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size, random_seed=random_seed) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. 
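+    # The metric below scales each sample's squared L2 norm by the square
+    # root of its mini-batch index, which presumably makes it sensitive to
+    # sample ordering so that data-store runs can be checked against the
+    # baseline run.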
+ import lbann.models + + # Layer graph + input_ = lbann.Input() + x = lbann.Identity(input_) + y = lbann.L2Norm2(x) + z = lbann.Multiply(y, lbann.Sqrt(lbann.MiniBatchIndex())) + + # Make sure all layers are on CPU + for layer in lbann.traverse_layer_graph(input_): + layer.device = 'cpu' + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(z, name='metric')] + + # Construct model + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input_), + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for ImageNet data reader. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.contrib.lc.paths + + # Construct data reader + message = lbann.reader_pb2.DataReader() + reader = message.reader.add() + + # Configure data reader + reader.name = 'imagenet' + reader.role = 'train' + reader.shuffle = False + reader.data_filedir = lbann.contrib.lc.paths.imagenet_dir(data_set='train') + reader.data_filename = lbann.contrib.lc.paths.imagenet_labels(data_set='train') + reader.percent_of_data_to_use = imagenet_fraction + reader.validation_percent = validation_percent + reader.num_labels = 1000 + reader.shuffle = True + + # Configure transforms + # Note: The image just resized to 32x32 + resize = reader.transforms.add().resize + resize.SetInParent() + resize.height = 32 + resize.width = 32 + colorize = reader.transforms.add().colorize + colorize.SetInParent() + normalize = reader.transforms.add().to_lbann_layout + normalize.SetInParent() + + return message + +# ============================================== +# Setup PyTest +# ============================================== +def run_datastore_test_func(test_func, baseline_metrics, cluster, exes, dirname, profile_data) : + '''Executes the input test function + + Args: + run_datastore_test_func (function): test function + baseline_metrics: list of metrics against which the output of + the test function will be compared + profile_data: dictionary of key, value pairs for testing + entries in the output file: data_store_profile_train.txt + + Returns: + list containg test name, pass/fail, etc. + On error, this will have the form: + ['FAILED', , ] + on success: + ['passed', ] + ''' + datastore_test_output = test_func(cluster, exes, dirname) + + test_name = test_func.__name__ + r = ['passed', test_name] + datastore_metrics = [] + with open(datastore_test_output['stdout_log_file']) as f: + for line in f: + match = re.search('validation metric : ([0-9.]+)', line) + if match: + datastore_metrics.append(float(match.group(1))) + + # Check if metrics are same in baseline and data store experiments + # Note: "Print statistics" callback will print up to 6 digits + # of metric values. 
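+    # The checks below allow for float32 round-off (8*eps relative) and for
+    # the ~6 significant digits printed by the callback (1.5 units in the
+    # sixth significant digit of the baseline value).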
+ if len(baseline_metrics) != len(datastore_metrics) : + r[0] = 'FAILED' + r.append('baseline and data store experiments did not run for same number of epochs; num baseline: ' + str(len(baseline_metrics)) + '; num ds: ' + str(len(datastore_metrics))) + + for i in range(len(datastore_metrics)): + x = baseline_metrics[i] + xhat = datastore_metrics[i] + eps = np.finfo(np.float32).eps + ceillogx = int(math.ceil(math.log10(x))) + if abs(x-xhat) >= max(8*eps*x, 1.5*10**(ceillogx-6)) : + r[0] = 'FAILED' + r.append('found large discrepancy in metrics for baseline and data store experiments') + + # Check if entries profile_data exist and have correct values + d = None + for key in profile_data.keys() : + if test_name.find(key) != -1 : + d = profile_data[key] + break + assert d != None, 'failed to find key for profile_data' + + found_profile_data = {} + with open(datastore_test_output['work_dir'] + '/data_store_profile_train.txt') as f: + for line in f: + for key in d : + if key in line and key not in found_profile_data.keys() : + t = line.split() + found_profile_data[key] = t[-1] + + for key in d.keys() : + if key not in found_profile_data.keys() : + r[0] = 'FAILED' + r.append('missing key in profile_data: ' + key) + elif found_profile_data[key] != d[key] : + r[0] = 'FAILED' + r.append('bad value for "' + key + '; value is: ' + str(found_profile_data[key]) + '; should be: ' + str(d[key])) + return r + +def run_baseline_test_func(baseline_test_func, cluster, exes, dirname) : + '''Executes the input test function + + Args: + baseline_test_func (function): test function + + Returns: + list of metrics that are parsed from the function's + output log + ''' + baseline_test_output = baseline_test_func(cluster, exes, dirname) + baseline_metrics = [] + with open(baseline_test_output['stdout_log_file']) as f: + for line in f: + match = re.search('validation metric : ([0-9.]+)', line) + if match: + baseline_metrics.append(float(match.group(1))) + + assert len(baseline_metrics) > 0, 'failed to parse baseline_metrics; len: ' + str(len(baseline_metrics)) + return baseline_metrics + +def create_test_func(baseline_test_func, datastore_test_funcs, profile_data=None) : + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. 
+ + """ + # Define test function + def func(cluster, exes, dirname, weekly): + # Run LBANN experiment without data store + baseline_metrics = run_baseline_test_func(baseline_test_func, cluster, exes, dirname) + + # Run LBANN experiments with data store + num_failed = 0 + results = [] + for i in range(len(datastore_test_funcs)) : + r = run_datastore_test_func(datastore_test_funcs[i], baseline_metrics, cluster, exes, dirname, profile_data) + results.append(r) + if len(r) > 2 : + num_failed += 1 + + work = [] + for x in results : + work.append(' :: '.join(x)) + result_string = '\n'.join(work) + assert num_failed == 0, '\n' + result_string + + print('\n===============================================') + print('data_store test synopsis:') + print(result_string) + print('===============================================\n') + + # Return test function from factory function + func.__name__ = baseline_test_func.__name__ + return func + +# Create test functions that can interact with PyTest +def make_test(name, test_by_platform_list=[], args=[]) : + test_list = tools.create_tests( + setup_experiment, + __file__, + nodes=num_nodes, + test_name_base=name, + lbann_args=args) + + if test_by_platform_list != [] : + for i in range(len(test_list)) : + test_by_platform_list[i].append(test_list[i]) + return test_list + +baseline_tests = make_test('nodatastore') + +datastore_tests = [[] for j in range(len(baseline_tests))] + +# Dictionary of dictionaries; this will contain data for testing +# the output file: data_store_profile_train.txt +profile_data = {} + +# handles for entries in the profile_data dictionaries +is_e = 'is_explicitly_loading' +is_l = 'is_local_cache' +is_f = 'is_fully_loaded' + +# test checkpoint, preload +test_name = 'data_store_checkpoint_preload' +make_test(test_name, datastore_tests, ['--preload_data_store', '--data_store_test_checkpoint=CHECKPOINT', '--data_store_profile']) +profile_data[test_name] = {is_e : '0', is_l : '0', is_f : '1'} + +# test checkpoint, explicit +test_name = 'data_store_checkpoint_explicit' +make_test(test_name, datastore_tests, ['--use_data_store', '--data_store_test_checkpoint=CHECKPOINT', '--data_store_profile']) +profile_data[test_name] = {is_e : '1', is_l : '0', is_f : '0'} + +# explicit loading +test_name = 'data_store_explicit' +make_test(test_name, datastore_tests, ['--use_data_store', '--data_store_profile']) +profile_data[test_name] = {is_e : '1', is_l : '0', is_f : '0'} + +# preloading +test_name = 'data_store_preload' +make_test(test_name, datastore_tests, ['--preload_data_store', '--data_store_profile']) +profile_data[test_name] = {is_e : '0', is_l : '0', is_f : '1'} + +#local cache with explicit loading (internally, this should run identically +#with the flag: --preload_data_store +test_name = 'data_store_cache_explicit' +make_test(test_name, datastore_tests, ['--data_store_cache', '--data_store_profile']) +profile_data[test_name] = {is_e : '1', is_l : '1', is_f : '0'} + +#local cache with preloading +test_name = 'data_store_cache_preloading' +make_test(test_name, datastore_tests, ['--data_store_cache', '--preload_data_store', '--data_store_profile']) +profile_data[test_name] = {is_e : '0', is_l : '1', is_f : '0'} + +#test local cache +test_name = 'data_store_test_cache' +make_test(test_name, datastore_tests, ['--data_store_cache', '--preload_data_store', '--data_store_test_cache', '--data_store_profile']) +profile_data[test_name] = {is_e : '0', is_l : '1', is_f : '0'} + +for i in range(len(datastore_tests)): + _test_func = 
create_test_func(baseline_tests[i], datastore_tests[i], profile_data) + globals()[_test_func.__name__] = _test_func diff --git a/bamboo/unit_tests/test_unit_layer_argmax.py b/bamboo/unit_tests/test_unit_layer_argmax.py new file mode 100644 index 00000000000..4228fd5bad4 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_argmax.py @@ -0,0 +1,147 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190911) +_num_samples = 35 +_sample_dims = (11,) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_samples[1,:] = 0.5 +_samples[15,:] = -1.0 +_samples[15,3] = -0.5 +_samples[15,5] = -0.5 + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1) + return np.inner(x, x) + + # LBANN implementation + x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + y = lbann.Argmax(x, device='cpu') + z = lbann.L2Norm2(y) + + # Objects for LBANN model + obj = z + metric = lbann.Metric(z, name='obj') + layers = list(lbann.traverse_layer_graph(z)) + callbacks = [] + + # Get expected metric value from NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = np.argmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=metric, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
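+    # Two otherwise-identical readers are registered: the 'test' reader drives
+    # the metric check (the model runs zero training epochs), and the 'train'
+    # reader is a placeholder needed until the issue above is resolved.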
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_argmin.py b/bamboo/unit_tests/test_unit_layer_argmin.py new file mode 100644 index 00000000000..b6830f0891a --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_argmin.py @@ -0,0 +1,147 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201909112) +_num_samples = 37 +_sample_dims = (11,) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_samples[1,:] = 0.5 +_samples[15,:] = 1.0 +_samples[15,3] = 0.5 +_samples[15,5] = 0.5 + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
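+
+    The model applies Argmin to each sample, takes the squared L2 norm of
+    the resulting index, and checks the mean against a NumPy np.argmin
+    reference via CallbackCheckMetric.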
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # LBANN implementation + x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + y = lbann.Argmin(x, device='cpu') + z = lbann.L2Norm2(y) + + # Objects for LBANN model + obj = z + metric = lbann.Metric(z, name='obj') + layers = list(lbann.traverse_layer_graph(z)) + callbacks = [] + + # Get expected metric value from NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = np.argmin(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=metric, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_channelwise_fully_connected.py b/bamboo/unit_tests/test_unit_layer_channelwise_fully_connected.py new file mode 100644 index 00000000000..bed7e415795 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_channelwise_fully_connected.py @@ -0,0 +1,309 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
+ +# Data +np.random.seed(20200113) +_num_samples = 17 +_sample_dims = (5,7,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_scale = np.random.normal(loc=1, size=(_sample_dims[0],1,1)).astype(np.float32) +_bias = np.random.normal(loc=0, size=(_sample_dims[0],1,1)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x0 = lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + x = lbann.Sum(x0, x1) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Compute expected metric values with NumPy + # ------------------------------------------ + + # Input and output dimensions + input_channel_dims = _sample_dims[1:] + output_channel_dims = (2,5) + input_channel_size = functools.reduce(operator.mul, input_channel_dims) + output_channel_size = functools.reduce(operator.mul, output_channel_dims) + + # Weight values + linearity = np.random.normal( + size=(output_channel_size,input_channel_size) + ).astype(np.float32) + bias = np.random.normal(size=(output_channel_size,1)).astype(np.float32) + + # With bias + x = (_samples + .reshape((-1,input_channel_size)) + .transpose() + .astype(np.float64)) + y = np.matmul(linearity.astype(np.float64), x) + bias.astype(np.float64) + z = tools.numpy_l2norm2(y) / _num_samples + val_with_bias = z + + # Without bias + x = (_samples + .reshape((-1,input_channel_size)) + .transpose() + .astype(np.float64)) + y = np.matmul(linearity.astype(np.float64), x) + z = tools.numpy_l2norm2(y) / _num_samples + val_without_bias = z + + # ------------------------------------------ + # Data-parallel layout, non-transpose, bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='F')) + ) + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(bias)) + ) + ) + x = x_lbann + y = lbann.ChannelwiseFullyConnected( + x, + weights=(linearity_weights, bias_weights), + output_channel_dims=output_channel_dims, + ) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, non-transpose, bias')) + + # NumPy implementation + tol = 8 * 
val_with_bias * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val_with_bias-tol, + upper_bound=val_with_bias+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, non-transpose, no bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='F')) + ) + ) + x = x_lbann + y = lbann.ChannelwiseFullyConnected( + x, + weights=(linearity_weights), + output_channel_dims=output_channel_dims, + bias=False, + ) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, non-transpose, no bias')) + + # NumPy implementation + tol = 8 * val_without_bias * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val_without_bias-tol, + upper_bound=val_without_bias+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, transpose, bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='C')) + ) + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(bias)) + ) + ) + x = x_lbann + y = lbann.ChannelwiseFullyConnected( + x, + weights=(linearity_weights, bias_weights), + output_channel_dims=output_channel_dims, + transpose=True, + ) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, transpose, bias')) + + # NumPy implementation + tol = 8 * val_with_bias * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val_with_bias-tol, + upper_bound=val_with_bias+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, transpose, no bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='C')) + ) + ) + x = x_lbann + y = lbann.ChannelwiseFullyConnected( + x, + weights=(linearity_weights), + output_channel_dims=output_channel_dims, + bias=False, + transpose=True, + ) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, transpose, no bias')) + + # NumPy implementation + tol = 8 * val_without_bias * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val_without_bias-tol, + upper_bound=val_without_bias+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def 
construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py new file mode 100644 index 00000000000..1df73392538 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py @@ -0,0 +1,161 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190719) +_num_samples = 23 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_scale = np.random.normal(loc=1, size=(_sample_dims[0],1,1)).astype(np.float32) +_bias = np.random.normal(loc=0, size=(_sample_dims[0],1,1)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
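+    # A zero-initialized WeightsLayer is summed with the input so that the
+    # model has trainable weights upstream of the layer under test; the
+    # CallbackCheckGradients callback added below can then verify the error
+    # signals propagated back through it.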
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x0 = lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + x = lbann.Sum(x0, x1) + + # Apply channel-wise scale/bias + scale_values = tools.str_list(np.nditer(_scale)) + bias_values = tools.str_list(np.nditer(_bias)) + scalebias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values, + bias_values)), + name='scalebias_weights' + ) + y = lbann.ChannelwiseScaleBias(x, weights=scalebias_weights) + z = lbann.L2Norm2(y) + + # Objects for LBANN model + obj = z + metric = lbann.Metric(z, name='obj') + layers = list(lbann.traverse_layer_graph(z)) + callbacks = [] + + # Get expected metric value from NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = _scale.astype(np.float64) * x + _bias.astype(np.float64) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Gradient checking + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=metric, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_channelwise_softmax.py b/bamboo/unit_tests/test_unit_layer_channelwise_softmax.py new file mode 100644 index 00000000000..921ecee364b --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_channelwise_softmax.py @@ -0,0 +1,176 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
+ +# Data +np.random.seed(20200115) +_num_samples = 15 +_sample_dims = (5,2,7) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(loc=0.5, size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy implementation +# ============================================== + +def numpy_channelwise_softmax(x): + if x.dtype is not np.float64: + x = x.astype(np.float64) + axis = tuple(range(1,x.ndim)) + shift = np.max(x, axis=axis, keepdims=True) + y = np.exp(x-shift) + return y / np.sum(y, axis=axis, keepdims=True) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.ChannelwiseSoftmax(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = numpy_channelwise_softmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py b/bamboo/unit_tests/test_unit_layer_clamp.py index 8cd7d579374..6ddd53a6d12 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -1,49 +1,196 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The clamp function is not differentiable at the interval +# boundaries, so we make sure values are well inside or well outside +# the interval. +np.random.seed(201910241) +_num_samples = 27 +_sample_size = 11 +_samples = np.random.choice([-193.0,-4.0,-1.0,1.0,3.0,5.0,2003.0], + size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Clamp(x, min=-2, max=2, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.clip(x, -2, 2) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Clamp(x, min=0, max=4, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.clip(x, 0, 4) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_clamp(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_clamp: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_clamp_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_clamp_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='clamp', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_clamp_clang4(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_clamp_gcc4_check(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_clamp_gcc7(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_clamp_intel18(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_clamp.py -k 'test_unit_layer_clamp_exe' --exe= -def test_unit_layer_clamp_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_clamp_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_clamp(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_concatenate.py b/bamboo/unit_tests/test_unit_layer_concatenate.py new file mode 100644 index 00000000000..946f8ec1fab --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_concatenate.py @@ -0,0 +1,278 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np +import pytest + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20191204) +_num_samples = 17 +_sample_size = 60 +_samples = np.random.normal(size=(_num_samples,_sample_size), loc=1).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
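+ # In each check below, the sample is sliced into pieces, the pieces are
+ # re-concatenated in a permuted order, and the metric compares
+ #     L2Norm2(x * concat(permuted slices))
+ # against the equivalent NumPy value, e.g. for the axis-0 case:
+ #     y = np.concatenate((x[3:5], x[1:3], x[0:1]), axis=0)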
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Input(), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # -------------------------- + # Concatenate along axis 0 + # -------------------------- + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims=tools.str_list([5,3,4])) + x_slice = lbann.Slice(x, axis=0, slice_points=tools.str_list([0,1,3,5])) + x1 = lbann.Identity(x_slice) + x2 = lbann.Identity(x_slice) + x3 = lbann.Identity(x_slice) + y = lbann.Concatenation(x3, x2, x1, axis=0) + z = lbann.L2Norm2(lbann.Multiply(x, y)) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis0')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape([5,3,4]).astype(np.float64) + x1 = x[0:1,:,:] + x2 = x[1:3,:,:] + x3 = x[3:5,:,:] + y = np.concatenate((x3, x2, x1), axis=0) + z = tools.numpy_l2norm2(x*y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Slice along axis 1 + # -------------------------- + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims=tools.str_list([3,4,5])) + x_slice = lbann.Slice(x, axis=1, slice_points=tools.str_list([0,1,3,4])) + x1 = lbann.Identity(x_slice) + x2 = lbann.Identity(x_slice) + x3 = lbann.Identity(x_slice) + y = lbann.Concatenation(x2, x1, x3, axis=1) + z = lbann.L2Norm2(lbann.Multiply(x, y)) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis1')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape([3,4,5]).astype(np.float64) + x1 = x[:,0:1,:] + x2 = x[:,1:3,:] + x3 = x[:,3:4,:] + y = np.concatenate((x2, x1, x3), axis=1) + z = tools.numpy_l2norm2(x*y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Slice along axis 2 + # -------------------------- + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims=tools.str_list([3,4,5])) + x_slice = lbann.Slice(x, axis=2, slice_points=tools.str_list([0,1,2,3,5])) + x1 = lbann.Identity(x_slice) + x2 = lbann.Identity(x_slice) + x3 = lbann.Identity(x_slice) + x4 = lbann.Identity(x_slice) + y = lbann.Concatenation(x2, x4, x1, x3, axis=2) + z = lbann.L2Norm2(lbann.Multiply(x, y)) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis2')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape([3,4,5]).astype(np.float64) + x1 = x[:,:,0:1] + x2 = x[:,:,1:2] + x3 = x[:,:,2:3] + x4 = x[:,:,3:5] + y = np.concatenate((x2, x4, x1, x3), axis=2) + z = tools.numpy_l2norm2(x*y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Model-parallel + # -------------------------- + + # LBANN implementation + x = x_lbann + x = 
lbann.Reshape(x, dims=tools.str_list([60])) + x_slice = lbann.Slice(x, slice_points=tools.str_list([0,22,23,60])) + x1 = lbann.Identity(x_slice) + x2 = lbann.Identity(x_slice) + x3 = lbann.Identity(x_slice) + y = lbann.Concatenation(x3, x1, x2, data_layout='model_parallel') + z = lbann.L2Norm2(lbann.Multiply(x, y)) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape([60]).astype(np.float64) + x1 = x[0:22] + x2 = x[22:23] + x3 = x[23:60] + y = np.concatenate((x3, x1, x2)) + z = tools.numpy_l2norm2(x*y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Gradient checking + # -------------------------- + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # -------------------------- + # Construct model + # -------------------------- + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_convolution.py b/bamboo/unit_tests/test_unit_layer_convolution.py new file mode 100644 index 00000000000..275b4d9d94d --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_convolution.py @@ -0,0 +1,327 @@ +import functools +import math +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +def make_random_array(shape, seed): + """Hacked function to generate a random array. + + NumPy's RNG produces different values with different NumPy + versions. This function is helpful when array values must be + identical across all runs, e.g. when checking against precomputed + metric values. + + Args: + shape (Iterable of int): Array dimensions + seed (int): Parameter for RNG. Must be non-zero. 
+ Returns: + numpy.ndarray: Array of `np.float32`. Values will be in + [-0.5,0.5). + + """ + size = functools.reduce(operator.mul, shape) + eps = np.finfo(np.float32).eps + x = (seed / np.linspace(math.sqrt(eps), 0.1, size)) % 1 - 0.5 + return x.reshape(shape).astype(np.float32) + +# Data +_num_samples = 23 +_sample_dims = [6,11,7] +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = make_random_array([_num_samples] + _sample_dims, 7) + +# Sample access functions +def get_sample(index): + return _samples[index,:].reshape(-1) +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# PyTorch convolution +# ============================================== + +def pytorch_convolution(data, + kernel, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1): + """Wrapper around PyTorch convolution. + + Input and output data are NumPy arrays. + + """ + + # Convert input data to PyTorch tensors with 64-bit floats + import torch + import torch.nn.functional + if type(data) is np.ndarray: + data = torch.from_numpy(data) + if type(kernel) is np.ndarray: + kernel = torch.from_numpy(kernel) + if type(bias) is np.ndarray: + bias = torch.from_numpy(bias) + if data.dtype is not torch.float64: + data = data.astype(torch.float64) + if kernel.dtype is not torch.float64: + kernel = kernel.astype(torch.float64) + if bias.dtype is not torch.float64: + bias = bias.astype(torch.float64) + + # Perform convolution with PyTorch + output = None + if len(kernel.shape) == 3: + output = torch.nn.functional.conv1d( + data, kernel, bias, stride, padding, dilation, groups + ) + if len(kernel.shape) == 4: + output = torch.nn.functional.conv2d( + data, kernel, bias, stride, padding, dilation, groups + ) + if len(kernel.shape) == 5: + output = torch.nn.functional.conv3d( + data, kernel, bias, stride, padding, dilation, groups + ) + if output is None: + raise ValueError('PyTorch only supports 1D, 2D, and 3D convolution') + + # Return output as NumPy array + return output.numpy() + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
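+ # Each convolution below is checked against a PyTorch reference computed by
+ # pytorch_convolution(); if PyTorch is not importable (or the conversion
+ # fails), the surrounding try/except falls back to a hard-coded precomputed
+ # value for the metric bound.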
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Basic 3x3 convolution + # ------------------------------------------ + # 3x3 conv, stride=1, pad=1, dilation=1, bias + + # Convolution settings + kernel_dims = (5, _sample_dims[0], 3, 3) + strides = (1, 1) + pads = (1, 1) + dilations = (1, 1) + kernel = make_random_array(kernel_dims, 11) + bias = make_random_array([kernel_dims[0]], 123) + + # Apply convolution + kernel_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(kernel))), + name='kernel1' + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(bias))), + name='bias1' + ) + x = x_lbann + y = lbann.Convolution(x, + weights=(kernel_weights, bias_weights), + num_dims=3, + num_output_channels=kernel_dims[0], + has_vectors=True, + conv_dims=tools.str_list(kernel_dims[2:]), + conv_strides=tools.str_list(strides), + conv_pads=tools.str_list(pads), + conv_dilations=tools.str_list(dilations), + has_bias=True) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='basic 3x3 convolution')) + + # PyTorch implementation + try: + x = _samples + y = pytorch_convolution( + x, kernel, bias=bias, + stride=strides, padding=pads, dilation=dilations + ) + z = tools.numpy_l2norm2(y) / _num_samples + val = z + except: + # Precomputed value + val = 153.84937996554953 + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # 2x4 strided convolution + # ------------------------------------------ + + # Convolution settings + kernel_dims = (3, _sample_dims[0], 2, 4) + strides = (3, 1) + pads = (3, 0) + dilations = (1, 1) + num_groups = 1 + kernel = make_random_array(kernel_dims, 19) + + # Apply convolution + kernel_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(kernel))), + name='kernel2' + ) + x = x_lbann + y = lbann.Convolution(x, + weights=(kernel_weights), + num_dims=3, + num_output_channels=kernel_dims[0], + has_vectors=True, + conv_dims=tools.str_list(kernel_dims[2:]), + conv_strides=tools.str_list(strides), + conv_pads=tools.str_list(pads), + conv_dilations=tools.str_list(dilations), + num_groups=num_groups, + has_bias=False) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='2x4 convolution')) + + # PyTorch implementation + try: + x = _samples + y = pytorch_convolution( + x, kernel, bias=None, + stride=strides, padding=pads, + dilation=dilations, groups=num_groups + ) + z = tools.numpy_l2norm2(y) / _num_samples + val = z + except: + # Precomputed value + val = 19.24587403346207 + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # 
------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_convolution_distconv.py b/bamboo/unit_tests/test_unit_layer_convolution_distconv.py new file mode 100644 index 00000000000..74e665a09cc --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_convolution_distconv.py @@ -0,0 +1,332 @@ +import functools +import math +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +def make_random_array(shape, seed): + """Hacked function to generate a random array. + + NumPy's RNG produces different values with different NumPy + versions. This function is helpful when array values must be + identical across all runs, e.g. when checking against precomputed + metric values. + + Args: + shape (Iterable of int): Array dimensions + seed (int): Parameter for RNG. Must be non-zero. + Returns: + numpy.ndarray: Array of `np.float32`. Values will be in + [-0.5,0.5). 
+ + """ + size = functools.reduce(operator.mul, shape) + eps = np.finfo(np.float32).eps + x = (seed / np.linspace(math.sqrt(eps), 0.1, size)) % 1 - 0.5 + return x.reshape(shape).astype(np.float32) + +# Data +_num_samples = 23 +_sample_dims = [6,11,7] +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = make_random_array([_num_samples] + _sample_dims, 7) + +# Sample access functions +def get_sample(index): + return _samples[index,:].reshape(-1) +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# PyTorch convolution +# ============================================== + +def pytorch_convolution(data, + kernel, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1): + """Wrapper around PyTorch convolution. + + Input and output data are NumPy arrays. + + """ + + # Convert input data to PyTorch tensors with 64-bit floats + import torch + import torch.nn.functional + if type(data) is np.ndarray: + data = torch.from_numpy(data) + if type(kernel) is np.ndarray: + kernel = torch.from_numpy(kernel) + if type(bias) is np.ndarray: + bias = torch.from_numpy(bias) + if data.dtype is not torch.float64: + data = data.astype(torch.float64) + if kernel.dtype is not torch.float64: + kernel = kernel.astype(torch.float64) + if bias.dtype is not torch.float64: + bias = bias.astype(torch.float64) + + # Perform convolution with PyTorch + output = None + if len(kernel.shape) == 3: + output = torch.nn.functional.conv1d( + data, kernel, bias, stride, padding, dilation, groups + ) + if len(kernel.shape) == 4: + output = torch.nn.functional.conv2d( + data, kernel, bias, stride, padding, dilation, groups + ) + if len(kernel.shape) == 5: + output = torch.nn.functional.conv3d( + data, kernel, bias, stride, padding, dilation, groups + ) + if output is None: + raise ValueError('PyTorch only supports 1D, 2D, and 3D convolution') + + # Return output as NumPy array + return output.numpy() + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def create_parallel_strategy(num_height_groups): + return {"height_groups": num_height_groups} + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
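+ # This test mirrors test_unit_layer_convolution.py, but attaches a Distconv
+ # parallel strategy to each convolution, i.e.
+ #     parallel_strategy=create_parallel_strategy(4)  # {'height_groups': 4}
+ # which presumably splits the tensor height across four processes; the
+ # PyTorch/precomputed reference values are unchanged.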
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Basic 3x3 convolution + # ------------------------------------------ + # 3x3 conv, stride=1, pad=1, dilation=1, bias + + # Convolution settings + kernel_dims = (5, _sample_dims[0], 3, 3) + strides = (1, 1) + pads = (1, 1) + dilations = (1, 1) + kernel = make_random_array(kernel_dims, 11) + bias = make_random_array([kernel_dims[0]], 123) + + # Apply convolution + kernel_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(kernel))), + name='kernel1' + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(bias))), + name='bias1' + ) + x = x_lbann + y = lbann.Convolution(x, + weights=(kernel_weights, bias_weights), + num_dims=3, + num_output_channels=kernel_dims[0], + has_vectors=True, + conv_dims=tools.str_list(kernel_dims[2:]), + conv_strides=tools.str_list(strides), + conv_pads=tools.str_list(pads), + conv_dilations=tools.str_list(dilations), + has_bias=True, + parallel_strategy=create_parallel_strategy(4)) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='basic 3x3 convolution')) + + # PyTorch implementation + try: + x = _samples + y = pytorch_convolution( + x, kernel, bias=bias, + stride=strides, padding=pads, dilation=dilations + ) + z = tools.numpy_l2norm2(y) / _num_samples + val = z + except: + # Precomputed value + val = 153.84937996554953 + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # 2x4 strided convolution + # ------------------------------------------ + + # Convolution settings + kernel_dims = (3, _sample_dims[0], 2, 4) + strides = (3, 1) + pads = (3, 0) + dilations = (1, 1) + num_groups = 1 + kernel = make_random_array(kernel_dims, 19) + + # Apply convolution + kernel_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(kernel))), + name='kernel2' + ) + x = x_lbann + y = lbann.Convolution(x, + weights=(kernel_weights), + num_dims=3, + num_output_channels=kernel_dims[0], + has_vectors=True, + conv_dims=tools.str_list(kernel_dims[2:]), + conv_strides=tools.str_list(strides), + conv_pads=tools.str_list(pads), + conv_dilations=tools.str_list(dilations), + num_groups=num_groups, + has_bias=False, + parallel_strategy=create_parallel_strategy(4)) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='2x4 convolution')) + + # PyTorch implementation + try: + x = _samples + y = pytorch_convolution( + x, kernel, bias=None, + stride=strides, padding=pads, + dilation=dilations, groups=num_groups + ) + z = tools.numpy_l2norm2(y) / _num_samples + val = z + except: + # Precomputed value + val = 19.24587403346207 + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + 
execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name, procs_per_node=4): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py index e72bca4fb51..a0137841b27 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -1,49 +1,261 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910242) +_samples = np.random.normal(size=(27,2,5)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = np.cov(np.stack((x0,x1), axis=0), bias=False)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = np.cov(np.stack((x0,x1), axis=0), bias=False)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, biased + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance(x0, x1, biased=True, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + x0 = x[:slice_size].astype(np.float64) + x1 = x[slice_size:].astype(np.float64) + y = np.cov(np.stack((x0,x1), axis=0), bias=True)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, biased + 
# ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance(x0, x1, biased=True, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + x0 = x[:slice_size].astype(np.float64) + x1 = x[slice_size:].astype(np.float64) + y = np.cov(np.stack((x0,x1), axis=0), bias=True)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_covariance: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_covariance_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_covariance_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='covariance', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_covariance_clang4(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_covariance_gcc4_check(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_covariance_gcc7(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_covariance_intel18(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
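+ # For reference, the covariance checks in construct_model above take the
+ # off-diagonal entry of NumPy's covariance matrix,
+ #     np.cov(np.stack((x0, x1), axis=0), bias=...)[0, 1]
+ # with bias matching the layer's `biased` argument.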
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_covariance_exe' --exe= -def test_unit_layer_covariance_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_covariance_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_covariance(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_cross_entropy.py b/bamboo/unit_tests/test_unit_layer_cross_entropy.py new file mode 100644 index 00000000000..a417f61947f --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_cross_entropy.py @@ -0,0 +1,232 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The error bounds for gradient checking assume that the fourth +# derivative of the objective function is ~1. However, given our loss +# function: +# L = ( -xhat * log(x) )^2 +# L'''' = O( xhat^2 * log(x) / x^4 ) +# We have x >= 0.25 to make sure the fourth derivative does not get +# too big and mess up the error bounds. +np.random.seed(201910143) +_samples = np.random.uniform(low=0.25, + high=1, + size=(23,2,7)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# NumPy cross entropy +# ============================================== + +def numpy_cross_entropy(x, xhat): + """Cross entropy between two distributions, computed with NumPy + + The computation is performed with 64-bit floats. + + Args: + x: Estimated distribution + xhat: True distribution + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + if xhat.dtype is not np.float64: + xhat = xhat.astype(np.float64) + return -np.inner(xhat, np.log(x)) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.CrossEntropy(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = -np.inner(x1, np.log(x0)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.CrossEntropy(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = -np.inner(x1, np.log(x0)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
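+ # For reference, the metric tolerance used above,
+ #     tol = 8 * val * np.finfo(np.float32).eps
+ # allows about eight float32 machine epsilons of relative error around the
+ # expected value.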
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_dist_embedding.py b/bamboo/unit_tests/test_unit_layer_dist_embedding.py new file mode 100644 index 00000000000..bdec9c11d54 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_dist_embedding.py @@ -0,0 +1,215 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +_seed = 20200117 +_num_samples = 41 +_num_embeddings = 11 +_sequence_length = 3 + +# Sample access functions +def get_sample(index): + np.random.seed(100*_seed+index) + return np.random.randint(_num_embeddings, size=_sequence_length) +def num_samples(): + return _num_samples +def sample_dims(): + return (_sequence_length,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + x = lbann.Identity(lbann.Input()) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # GPU + # ------------------------------------------ + + # Embeddings + np.random.seed(_seed) + embedding_dim = 7 + embeddings = np.random.normal(size=(_num_embeddings,embedding_dim)) + + # LBANN implementation + embedding_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(embeddings))) + ) + x = x_lbann + y = lbann.DistEmbedding(x, + weights=embedding_weights, + num_embeddings=_num_embeddings, + embedding_dim=embedding_dim, + barrier_in_forward_prop=True, + device='gpu') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='GPU')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = embeddings[x,:] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # CPU + # ------------------------------------------ + + # Embeddings + np.random.seed(_seed) + embedding_dim = 5 + embeddings = np.random.normal(size=(_num_embeddings,embedding_dim)) + + # LBANN implementation + embedding_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(embeddings))) + ) + x = x_lbann + y = lbann.DistEmbedding(x, + weights=embedding_weights, + num_embeddings=_num_embeddings, + embedding_dim=embedding_dim, + barrier_in_forward_prop=True, + device='cpu') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='CPU')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = embeddings[x,:] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
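+ # Note that get_sample() above re-seeds NumPy with 100*_seed + index, so
+ # every sample is reproducible on demand without materializing the whole
+ # dataset up front.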
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note (tym 6/12/20): Tests are disabled for now since the default +# build doesn't include SHMEM or NVSHMEM. Restore these tests when +# proper support is added. +# for test in tools.create_tests(setup_experiment, __file__): +# globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index 66b10d1fc5b..e42882e264d 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -1,49 +1,194 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: ELU is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910243) +_num_samples = 37 +_sample_size = 8 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
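+ # The NumPy reference below uses the ELU definition
+ #     y = x                  if x >= 0
+ #     y = alpha*(exp(x)-1)   if x <  0
+ # i.e. np.where(x < 0, alpha * np.expm1(x), x), with alpha = 1 for the
+ # data-parallel check and alpha = 0.5 for the model-parallel check.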
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Elu(x, alpha=1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x < 0, np.expm1(x), x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Elu(x, alpha=0.5, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x < 0, 0.5*np.expm1(x), x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_elu(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_elu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_elu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_elu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='elu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_elu_clang4(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_elu_gcc4_check(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_elu_gcc7(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_elu_intel18(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_elu.py -k 'test_unit_layer_elu_exe' --exe= -def test_unit_layer_elu_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_elu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_elu(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_embedding.py b/bamboo/unit_tests/test_unit_layer_embedding.py new file mode 100644 index 00000000000..8f05792c6f6 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_embedding.py @@ -0,0 +1,210 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +_num_samples = 41 +_num_embeddings = 11 +_sequence_length = 3 + +# Sample access functions +def get_sample(index): + np.random.seed(2019101500+index) + return np.random.randint(_num_embeddings, size=_sequence_length) +def num_samples(): + return _num_samples +def sample_dims(): + return (_sequence_length,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + x = lbann.Identity(lbann.Input()) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # No padding index + # ------------------------------------------ + + # Embeddings + np.random.seed(20191015) + embedding_dim = 5 + embeddings = np.random.normal(size=(_num_embeddings,embedding_dim)) + + # LBANN implementation + embedding_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(embeddings))) + ) + x = x_lbann + y = lbann.Embedding(x, + weights=embedding_weights, + num_embeddings=_num_embeddings, + embedding_dim=embedding_dim) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='no padding index')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = embeddings[x,:] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Padding index 0 + # ------------------------------------------ + + # Embeddings + np.random.seed(201910152) + embedding_dim = 7 + padding_idx = 0 + embeddings = np.random.normal(size=(_num_embeddings,embedding_dim)) + + # LBANN implementation + # Note: Embedding layer gradients are not exact if a padding index + # is set. Avoid gradient checking by not using an optimizer. + embedding_weights = lbann.Weights( + optimizer=None, + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(embeddings))) + ) + x = x_lbann + y = lbann.Embedding(x, + weights=embedding_weights, + num_embeddings=_num_embeddings, + embedding_dim=embedding_dim, + padding_idx=padding_idx) + z = lbann.L2Norm2(y) + metrics.append(lbann.Metric(z, name='padding index = 0')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = np.where((x==padding_idx).reshape((-1,1)), 0, embeddings[x,:]) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py new file mode 100644 index 00000000000..ced689cdad4 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py @@ -0,0 +1,174 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190815) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: We want to use gradient checking to verify that error + # signals are correct. To do this, we zero-initialize a weights + # object, construct a zero-valued tensor, and add it to the + # input. To make sure that batchnorm is non-trivial, we multiply + # the zero-valued tensor by the mini-batch index. 
+ x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x0 = lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims)) + x1 = lbann.Divide(lbann.MiniBatchIndex(), lbann.MiniBatchSize()) + x1 = lbann.Tessellate(lbann.Reshape(x1, dims='1 1 1'), dims=tools.str_list(_sample_dims)) + x = lbann.Sum(x, lbann.Multiply(x0, x1)) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + decay = 0.9 + epsilon = 1e-5 + x = x_lbann + y = lbann.EntrywiseBatchNormalization(x, + decay=decay, + epsilon=epsilon, + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + decay = 0.9 + epsilon = 1e-5 + x = x_lbann + y = lbann.EntrywiseBatchNormalization(x, + decay=decay, + epsilon=epsilon, + data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 1 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py new file mode 100644 index 00000000000..e6308a8ea4f --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py @@ -0,0 +1,209 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190723) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_scale = np.random.normal(loc=1, size=_sample_dims).astype(np.float32) +_bias = np.random.normal(loc=0, size=_sample_dims).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + scale_values = tools.str_list(np.nditer(_scale)) + bias_values = tools.str_list(np.nditer(_bias)) + scalebias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values, + bias_values))) + x = x_lbann + y = lbann.EntrywiseScaleBias(x, + weights=scalebias_weights, + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = _scale.astype(np.float64) * x + _bias.astype(np.float64) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + scale_values = tools.str_list(np.nditer(_scale)) + bias_values = tools.str_list(np.nditer(_bias)) + scalebias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values, + bias_values))) + x = x_lbann + y = lbann.EntrywiseScaleBias(x, + weights=scalebias_weights, + data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = _scale.astype(np.float64) * x + _bias.astype(np.float64) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_fully_connected.py b/bamboo/unit_tests/test_unit_layer_fully_connected.py new file mode 100644 index 00000000000..4ccee406c39 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_fully_connected.py @@ -0,0 +1,299 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20191011) +_num_samples = 31 +_input_size = 11 +_output_size = 3 +_samples = np.random.normal(size=(_num_samples,_input_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_input_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_input_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_input_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Compute expected metric values with NumPy + # ------------------------------------------ + + # Weight values + linearity = np.random.normal(size=(_output_size,_input_size)).astype(np.float32) + bias = np.random.normal(size=(_output_size,1)).astype(np.float32) + + # With bias + x = _samples.transpose().astype(np.float64) + y = np.matmul(linearity.astype(np.float64), x) + bias.astype(np.float64) + z = tools.numpy_l2norm2(y) / _num_samples + val_with_bias = z + + # Without bias + x = _samples.transpose().astype(np.float64) + y = np.matmul(linearity.astype(np.float64), x) + z = tools.numpy_l2norm2(y) / _num_samples + val_without_bias = z + + # ------------------------------------------ + # Data-parallel layout, non-transpose, bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='F')) + ) + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(bias)) + ) + ) + x = x_lbann + y = lbann.FullyConnected(x, + weights=(linearity_weights, bias_weights), + data_layout='data_parallel', + num_neurons=_output_size, + has_bias=True, + transpose=False) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, non-transpose, bias')) + + # NumPy implementation + val = val_with_bias + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, non-transpose, bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='F')) + ) + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(bias)) + ) + ) + x = x_lbann + y = lbann.FullyConnected(x, + weights=(linearity_weights, bias_weights), + data_layout='model_parallel', + num_neurons=_output_size, + has_bias=True, + transpose=False) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, non-transpose, bias')) + + # NumPy implementation + val = val_with_bias + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, transpose, no bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=tools.str_list(np.nditer(linearity, order='C')) + ) + ) + x = x_lbann + y = lbann.FullyConnected(x, + 
weights=linearity_weights,
+                             data_layout='data_parallel',
+                             num_neurons=_output_size,
+                             has_bias=False,
+                             transpose=True)
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='data-parallel layout, transpose, no bias'))
+
+    # NumPy implementation
+    val = val_without_bias
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Model-parallel layout, transpose, no bias
+    # ------------------------------------------
+
+    # LBANN implementation
+    linearity_weights = lbann.Weights(
+        optimizer=lbann.SGD(),
+        initializer=lbann.ValueInitializer(
+            values=tools.str_list(np.nditer(linearity, order='C'))
+        )
+    )
+    x = x_lbann
+    y = lbann.FullyConnected(x,
+                             weights=linearity_weights,
+                             data_layout='model_parallel',
+                             num_neurons=_output_size,
+                             has_bias=False,
+                             transpose=True)
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='model-parallel layout, transpose, no bias'))
+
+    # NumPy implementation
+    val = val_without_bias
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Gradient checking
+    # ------------------------------------------
+
+    callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True))
+
+    # ------------------------------------------
+    # Construct model
+    # ------------------------------------------
+
+    num_epochs = 0
+    return lbann.Model(num_epochs,
+                       layers=lbann.traverse_layer_graph(x_lbann),
+                       objective_function=obj,
+                       metrics=metrics,
+                       callbacks=callbacks)
+
+def construct_data_reader(lbann):
+    """Construct Protobuf message for Python data reader.
+
+    The Python data reader will import the current Python file to
+    access the sample access functions.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Note: The training data reader should be removed when
+    # https://github.com/LLNL/lbann/issues/1098 is resolved.
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py index 86568e946d5..cb9a523d62f 100644 --- a/bamboo/unit_tests/test_unit_layer_identity.py +++ b/bamboo/unit_tests/test_unit_layer_identity.py @@ -1,49 +1,190 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910244) +_num_samples = 83 +_sample_size = 47 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Identity(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Identity(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_identity(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_identity: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_identity_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_identity_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='identity', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_identity_clang4(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_identity_gcc4_check(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_identity_gcc7(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_identity_intel18(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_identity.py -k 'test_unit_layer_identity_exe' --exe= -def test_unit_layer_identity_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_identity_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_identity(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_identity_distconv.py b/bamboo/unit_tests/test_unit_layer_identity_distconv.py new file mode 100644 index 00000000000..7a991359bcc --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_identity_distconv.py @@ -0,0 +1,196 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910244) +_num_samples = 83 +_sample_size = 48 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def create_parallel_strategy(num_height_groups): + return {"height_groups": num_height_groups} + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout with distconv + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims="4 4 3") + y = lbann.Identity(x, data_layout='data_parallel', + parallel_strategy=create_parallel_strategy(4)) + x = lbann.Reshape(x, dims="48") + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Identity(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__, procs_per_node=4): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_instance_norm.py b/bamboo/unit_tests/test_unit_layer_instance_norm.py new file mode 100644 index 00000000000..bdc2c44a075 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_instance_norm.py @@ -0,0 +1,176 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20200107) +_num_samples = 15 +_sample_dims = (5,3,7) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(loc=0.5, size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy implementation +# ============================================== + +def numpy_instance_norm(x, epsilon=1e-5): + if x.dtype is not np.float64: + x = x.astype(np.float64) + axes = tuple(range(1,x.ndim)) + mean = np.mean(x, axis=axes, keepdims=True) + var = np.var(x, ddof=1, axis=axes, keepdims=True) + return (x - mean) / np.sqrt(var + epsilon) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.InstanceNorm(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = numpy_instance_norm(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index 9abcc2652ce..c3054f862f0 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -1,49 +1,194 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
+ +# Data +# Note: The L1 norm is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910245) +_num_samples = 23 +_sample_size = 11 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.L1Norm(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.linalg.norm(x, 1) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.L1Norm(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.linalg.norm(x, 1) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_l1_norm(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_l1_norm: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_l1_norm_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_l1_norm_%s_error.txt' % (dir_name, 
compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='l1_norm', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_l1_norm_clang4(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_l1_norm_gcc4_check(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_l1_norm_gcc7(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_l1_norm_intel18(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l1_norm_exe' --exe= -def test_unit_layer_l1_norm_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_l1_norm_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_l1_norm(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py b/bamboo/unit_tests/test_unit_layer_l2_norm2.py deleted file mode 100644 index cdbad231498..00000000000 --- a/bamboo/unit_tests/test_unit_layer_l2_norm2.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - - -def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_l2_norm2: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_l2_norm2_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_l2_norm2_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - 
cluster=cluster, executable=executables[compiler_name], num_nodes=1,
-        num_processes=2, dir_name=dir_name,
-        data_filedir_default='', data_reader_name='synthetic',
-        model_folder='tests/layer_tests', model_name='l2_norm2',
-        optimizer_name='sgd',
-        output_file_name=output_file_name, error_file_name=error_file_name)
-    return_code = os.system(command)
-    assert return_code == 0
-
-
-def test_unit_layer_l2_norm2_clang4(cluster, exes, dirname):
-    skeleton_layer_l2_norm2(cluster, exes, dirname, 'clang4')
-
-
-def test_unit_layer_l2_norm2_gcc4_check(cluster, exes, dirname):
-    skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc4')
-
-
-def test_unit_layer_l2_norm2_gcc7(cluster, exes, dirname):
-    skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc7')
-
-
-def test_unit_layer_l2_norm2_intel18(cluster, exes, dirname):
-    skeleton_layer_l2_norm2(cluster, exes, dirname, 'intel18')
-
-
-# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l2_norm2_exe' --exe=
-def test_unit_layer_l2_norm2_exe(cluster, dirname, exe):
-    if exe is None:
-        e = 'test_unit_layer_l2_norm2_exe: Non-local testing'
-        print('Skip - ' + e)
-        pytest.skip(e)
-    exes = {'exe': exe}
-    skeleton_layer_l2_norm2(cluster, exes, dirname, 'exe')
diff --git a/bamboo/unit_tests/test_unit_layer_layer_norm.py b/bamboo/unit_tests/test_unit_layer_layer_norm.py
new file mode 100644
index 00000000000..c3daef9b888
--- /dev/null
+++ b/bamboo/unit_tests/test_unit_layer_layer_norm.py
@@ -0,0 +1,202 @@
+import functools
+import operator
+import os
+import os.path
+import sys
+import numpy as np
+
+# Bamboo utilities
+current_file = os.path.realpath(__file__)
+current_dir = os.path.dirname(current_file)
+sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python'))
+import tools
+
+# ==============================================
+# Objects for Python data reader
+# ==============================================
+# Note: The Python data reader imports this file as a module and calls
+# the functions below to ingest data.
+
+# Data
+np.random.seed(20191114)
+_num_samples = 31
+_sample_size = 31
+_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32)
+
+# Sample access functions
+def get_sample(index):
+    return _samples[index,:]
+def num_samples():
+    return _num_samples
+def sample_dims():
+    return (_sample_size,)
+
+# ==============================================
+# NumPy layer norm
+# ==============================================
+
+def numpy_layer_norm(x, epsilon=1e-5):
+    if x.dtype is not np.float64:
+        x = x.astype(np.float64)
+    mean = np.mean(x)
+    var = np.var(x, ddof=1)
+    return (x - mean) / np.sqrt(var + epsilon)
+
+# ==============================================
+# Setup LBANN experiment
+# ==============================================
+
+def setup_experiment(lbann):
+    """Construct LBANN experiment.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+    mini_batch_size = num_samples() // 2
+    trainer = lbann.Trainer(mini_batch_size)
+    model = construct_model(lbann)
+    data_reader = construct_data_reader(lbann)
+    optimizer = lbann.NoOptimizer()
+    return trainer, model, data_reader, optimizer
+
+def construct_model(lbann):
+    """Construct LBANN model.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Input data
+    # Note: Sum with a weights layer so that gradient checking will
+    # verify that error signals are correct.
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LayerNorm(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_layer_norm(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + epsilon = 0.0123 + x = x_lbann + y = lbann.LayerNorm(x, data_layout='model_parallel', epsilon=epsilon) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_layer_norm(x, epsilon) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index 6c90b34ce78..cce0e0802c2 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -1,49 +1,194 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The leaky ReLU is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910246) +_num_samples = 23 +_sample_size = 11 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
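The leaky ReLU data keeps every value at least 0.5 away from the origin (a random sign of magnitude 1 plus uniform noise in [-0.5, 0.5]), so the gradient check never evaluates the activation at its non-differentiable point. A short sketch of that data trick together with the element-wise reference used for both layouts:

    import numpy as np

    np.random.seed(0)  # illustrative seed, not the one used by the test
    samples = np.random.choice([-1.0, 1.0], size=(23, 11))
    samples += np.random.uniform(-0.5, 0.5, size=samples.shape)
    assert np.abs(samples).min() >= 0.5          # no value sits on the kink at 0

    def leaky_relu(x, negative_slope):
        # Same reference as the test: identity for positive inputs,
        # scaled identity for negative inputs.
        return np.where(x > 0, x, negative_slope * x)

    y = leaky_relu(samples, 0.01)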
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LeakyRelu(x, negative_slope=0.01, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x > 0, x, 0.01*x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LeakyRelu(x, negative_slope=2, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x > 0, x, 2*x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_leaky_relu(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_leaky_relu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_leaky_relu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_leaky_relu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='leaky_relu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_leaky_relu_clang4(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_leaky_relu_gcc4_check(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf 
message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_leaky_relu_gcc7(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_leaky_relu_intel18(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_leaky_relu_exe' --exe= -def test_unit_layer_leaky_relu_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_leaky_relu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_leaky_relu(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu_distconv.py b/bamboo/unit_tests/test_unit_layer_leaky_relu_distconv.py new file mode 100644 index 00000000000..e3abe076bef --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu_distconv.py @@ -0,0 +1,205 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The leaky ReLU is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910246) +_num_samples = 23 +_sample_size = 48 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def create_parallel_strategy(num_height_groups): + return {"height_groups": num_height_groups} + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims="4 2 6") + y = lbann.LeakyRelu(x, negative_slope=0.01, + data_layout='data_parallel', + parallel_strategy=create_parallel_strategy(4)) + y = lbann.Reshape(y, dims=str(sample_dims())) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x > 0, x, 0.01*x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims="4 2 6") + y = lbann.LeakyRelu(x, negative_slope=2, + data_layout='model_parallel', + parallel_strategy=create_parallel_strategy(4)) + y = lbann.Reshape(y, dims=str(sample_dims())) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x > 0, x, 2*x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. 
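In the distconv variant the flat 48-element sample is reshaped to a 4x2x6 tensor purely so that DiHydrogen can partition it; the element-wise math and the expected metric are unchanged. The parallel strategy is just a dictionary naming the number of height groups, as a small sketch shows:

    def create_parallel_strategy(num_height_groups):
        # Same helper as in the test: partition the height dimension
        # across the given number of groups.
        return {"height_groups": num_height_groups}

    sample_size = 48
    dims = (4, 2, 6)                                     # passed to lbann.Reshape as "4 2 6"
    assert dims[0] * dims[1] * dims[2] == sample_size    # reshape keeps the element count
    print(create_parallel_strategy(4))                   # {'height_groups': 4}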
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__, procs_per_node=4): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index 9a47d55754d..fa2ea6035ee 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -1,49 +1,192 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The L1 norm is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910247) +_num_samples = 23 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
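The NumPy reference below evaluates log sigmoid as x - log(1 + e^x), which is exact but can overflow for large positive inputs; the test's samples are small enough that this is not a concern. Purely as an illustration, an algebraically equivalent overflow-safe form:

    import numpy as np

    def log_sigmoid_reference(x):
        # Form used by the test: log(sigmoid(x)) = x - log(1 + exp(x)).
        return x - np.log1p(np.exp(x))

    def log_sigmoid_stable(x):
        # Equivalent rewrite that never exponentiates a large positive number.
        return np.minimum(x, 0.0) - np.log1p(np.exp(-np.abs(x)))

    x = np.random.normal(size=7)
    assert np.allclose(log_sigmoid_reference(x), log_sigmoid_stable(x))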
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LogSigmoid(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x - np.log1p(np.exp(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LogSigmoid(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x - np.log1p(np.exp(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_log_sigmoid: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_log_sigmoid_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_log_sigmoid_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='log_sigmoid', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_log_sigmoid_clang4(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_log_sigmoid_gcc4_check(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_log_sigmoid_gcc7(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_log_sigmoid_intel18(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_log_sigmoid.py -k 'test_unit_layer_log_sigmoid_exe' --exe= -def test_unit_layer_log_sigmoid_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_log_sigmoid_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 85a20790d31..d541a3c7531 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -1,49 +1,205 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910213) +_num_samples = 15 +_sample_size = 11 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy softmax +# ============================================== + +def numpy_log_softmax(x): + """Log-softmax, computed with NumPy + + The computation is performed with 64-bit floats. + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + x = x - np.max(x) + return x - np.log(np.sum(np.exp(x))) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LogSoftmax(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_log_softmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LogSoftmax(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_log_softmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_log_softmax: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_log_softmax_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_log_softmax_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='log_softmax', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_log_softmax_clang4(cluster, exes, dirname): - 
skeleton_layer_log_softmax(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_log_softmax_gcc4_check(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_log_softmax_gcc7(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_log_softmax_intel18(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_log_softmax_exe' --exe= -def test_unit_layer_log_softmax_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_log_softmax_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_log_softmax(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_matmul.py b/bamboo/unit_tests/test_unit_layer_matmul.py new file mode 100644 index 00000000000..53dd3a2557b --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_matmul.py @@ -0,0 +1,264 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20191111) +_m = 11 +_n = 3 +_k = 5 +_samples = np.random.normal(size=(27,_m*_k+_k*_n)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, _m*_k, _m*_k+_k*_n])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(_m*_k))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(_k*_n))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # NN GEMM + # ------------------------------------------ + + # LBANN implementation + x0 = lbann.Reshape(x0_lbann, dims=tools.str_list([_m, _k])) + x1 = lbann.Reshape(x1_lbann, dims=tools.str_list([_k, _n])) + y = lbann.MatMul(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='NN GEMM')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:_m*_k].reshape([_m,_k]) + x1 = x[_m*_k:].reshape([_k,_n]) + y = np.matmul(x0, x1) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # TN GEMM + # ------------------------------------------ + + # LBANN implementation + x0 = lbann.Reshape(x0_lbann, dims=tools.str_list([_k, _m])) + x1 = lbann.Reshape(x1_lbann, dims=tools.str_list([_k, _n])) + y = lbann.MatMul(x0, x1, transpose_a=True, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='TN GEMM')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:_m*_k].reshape([_k,_m]) + x1 = x[_m*_k:].reshape([_k,_n]) + y = np.matmul(x0.transpose(), x1) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # NT GEMM + # ------------------------------------------ + + # LBANN implementation + x0 = lbann.Reshape(x0_lbann, dims=tools.str_list([_m, _k])) + x1 = lbann.Reshape(x1_lbann, dims=tools.str_list([_n, _k])) + y = lbann.MatMul(x0, x1, transpose_b=True, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='NT GEMM')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = 
x[:_m*_k].reshape([_m,_k]) + x1 = x[_m*_k:].reshape([_n,_k]) + y = np.matmul(x0, x1.transpose()) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # TT GEMM + # ------------------------------------------ + + # LBANN implementation + x0 = lbann.Reshape(x0_lbann, dims=tools.str_list([_k, _m])) + x1 = lbann.Reshape(x1_lbann, dims=tools.str_list([_n, _k])) + y = lbann.MatMul(x0, x1, transpose_a=True, transpose_b=True, + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='TT GEMM')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:_m*_k].reshape([_k,_m]) + x1 = x[_m*_k:].reshape([_n,_k]) + y = np.matmul(x0.transpose(), x1.transpose()) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
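The four GEMM checks above slice one flat sample into the two operand matrices and differ only in which operand is transposed; all four products have shape (_m, _n). Restating the NumPy references compactly with the same _m, _n, _k as the test:

    import numpy as np

    _m, _n, _k = 11, 3, 5
    x = np.random.normal(size=_m * _k + _k * _n)
    a, b = x[:_m * _k], x[_m * _k:]

    y_nn = np.matmul(a.reshape(_m, _k), b.reshape(_k, _n))        # NN GEMM
    y_tn = np.matmul(a.reshape(_k, _m).T, b.reshape(_k, _n))      # TN GEMM
    y_nt = np.matmul(a.reshape(_m, _k), b.reshape(_n, _k).T)      # NT GEMM
    y_tt = np.matmul(a.reshape(_k, _m).T, b.reshape(_n, _k).T)    # TT GEMM
    assert y_nn.shape == y_tn.shape == y_nt.shape == y_tt.shape == (_m, _n)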
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index c21544ed295..4c4c8eb7045 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -1,49 +1,204 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: MAE is not differentiable when the two inputs match, so we +# make sure inputs have separated values. +np.random.seed(201910248) +_samples = np.random.uniform(-0.25, 0.25, size=(27,2,7)).astype(np.float32) +_samples[:,1,:] += np.random.choice([-1.0,1.0], size=(27,7)) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. 
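The mean absolute error reference below is written as the 1-norm of the difference divided by the slice size, which equals the familiar mean of absolute differences; the ±1 offset added to the second half of each sample keeps the two inputs separated, so the absolute value is always differentiable. A small illustration:

    import numpy as np

    slice_size = 7
    x0 = np.random.uniform(-0.25, 0.25, size=slice_size)
    x1 = (np.random.uniform(-0.25, 0.25, size=slice_size)
          + np.random.choice([-1.0, 1.0], size=slice_size))
    assert np.abs(x1 - x0).min() >= 0.5                  # inputs never coincide

    mae_norm = np.linalg.norm(x1 - x0, 1) / slice_size   # form used by the test
    mae_mean = np.mean(np.abs(x1 - x0))                  # equivalent form
    assert np.isclose(mae_norm, mae_mean)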
+ slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.MeanAbsoluteError(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = np.linalg.norm(x1-x0, 1) / slice_size + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.MeanAbsoluteError(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = np.linalg.norm(x1-x0, 1) / slice_size + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_mean_absolute_error(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_mean_absolute_error: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_mean_absolute_error_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_mean_absolute_error_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='mean_absolute_error', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_mean_absolute_error_clang4(cluster, exes, dirname): - 
skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_mean_absolute_error_gcc4_check(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_mean_absolute_error_gcc7(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_mean_absolute_error_intel18(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_mean_absolute_error_exe' --exe= -def test_unit_layer_mean_absolute_error_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_mean_absolute_error_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_mean_squared_error.py b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py new file mode 100644 index 00000000000..2e6a1cef5f6 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py @@ -0,0 +1,201 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910249) +_samples = np.random.normal(size=(27,2,13)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.MeanSquaredError(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = tools.numpy_l2norm2(x1-x0) / slice_size + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.MeanSquaredError(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = tools.numpy_l2norm2(x1-x0) / slice_size + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_one_hot.py b/bamboo/unit_tests/test_unit_layer_one_hot.py new file mode 100644 index 00000000000..6a3db01a79b --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_one_hot.py @@ -0,0 +1,139 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +one_hot_size = 7 +seed = 201909113 + +# Sample access functions +def get_sample(index): + np.random.seed(seed+index) + return [np.random.uniform(-1, one_hot_size+1)] +def num_samples(): + return 47 +def sample_dims(): + return (1,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Layer graph + x = lbann.Input() + y1 = lbann.OneHot(x, size=one_hot_size) + y2 = lbann.Concatenation([lbann.Constant(value=i+1, num_neurons='1') + for i in range(one_hot_size)]) + y = lbann.Multiply(y1, y2) + z = lbann.L2Norm2(y) + + # Objects for LBANN model + layers = list(lbann.traverse_layer_graph(x)) + metric = lbann.Metric(z, name='obj') + obj = lbann.ObjectiveFunction(z) + callbacks = [] + + # Compute expected metric value + vals = [] + for i in range(num_samples()): + x = get_sample(i)[0] + y = int(x) + 1 if (0 <= x and x < one_hot_size) else 0 + z = y * y + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Construct model + num_epochs = 0 + return lbann.Model(num_epochs, + layers=layers, + objective_function=obj, + metrics=[metric], + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index c904cce301f..0b5700aca99 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -1,49 +1,194 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: ReLU is not differentiable at 0, so we make sure values +# are away from 0. 
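For the one-hot test above, the expected metric has a closed form: the one-hot vector is multiplied element-wise by the constants (1, 2, ..., one_hot_size), so the squared L2 norm is (int(x) + 1)^2 when x falls in [0, one_hot_size) and 0 otherwise (the test assumes out-of-range inputs produce an all-zero one-hot vector). A pure-Python restatement:

    one_hot_size = 7

    def expected_one_hot_metric(x):
        # L2Norm2 of one_hot(x) * (1, 2, ..., one_hot_size): the single hot
        # entry at index int(x) picks out the constant int(x) + 1.
        y = int(x) + 1 if (0 <= x < one_hot_size) else 0
        return float(y * y)

    print(expected_one_hot_metric(3.2))   # 16.0  (index 3 -> constant 4)
    print(expected_one_hot_metric(-0.5))  # 0.0   (out of range)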
+np.random.seed(2019102410) +_num_samples = 23 +_sample_size = 41 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Relu(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.maximum(x, 0.0) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Relu(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.maximum(x, 0.0) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_relu(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_relu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_relu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_relu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, 
num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='relu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_relu_clang4(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_relu_gcc4_check(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_relu_gcc7(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_relu_intel18(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_relu.py -k 'test_unit_layer_relu_exe' --exe= -def test_unit_layer_relu_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_relu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_relu(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_relu_distconv.py b/bamboo/unit_tests/test_unit_layer_relu_distconv.py new file mode 100644 index 00000000000..eed3171b2b5 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_relu_distconv.py @@ -0,0 +1,203 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: ReLU is not differentiable at 0, so we make sure values +# are away from 0. 
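+# (Same scheme as the plain ReLU test: values from {-1.0, +1.0} plus a
+# uniform jitter in [-0.5, 0.5), so no entry lands at 0. A sample size of
+# 48 also reshapes cleanly to the 4 x 2 x 6 tensor used in construct_model.)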
+np.random.seed(2019102410) +_num_samples = 23 +_sample_size = 48 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size=mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def create_parallel_strategy(num_height_groups): + return {"height_groups": num_height_groups} + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims="4 2 6") + y = lbann.Relu(x, data_layout='data_parallel', + parallel_strategy=create_parallel_strategy(4)) + y = lbann.Reshape(y, dims=str(sample_dims())) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.maximum(x, 0.0) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + x = lbann.Reshape(x, dims="4 2 6") + y = lbann.Relu(x, data_layout='model_parallel', + parallel_strategy=create_parallel_strategy(4)) + y = lbann.Reshape(y, dims=str(sample_dims())) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.maximum(x, 0.0) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + 
callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__, procs_per_node=4): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index b32f8c9eb71..c8a4c3dc197 100644 --- a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -1,49 +1,210 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: SELU is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(2019102411) +_num_samples = 20 +_sample_size = 5 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy SELU +# ============================================== + +def numpy_selu(x): + """NumPy implementation of SELU activation. + + The computation is performed with 64-bit floats. + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + alpha = 1.6732632423543772848170429916717 + scale = 1.0507009873554804934193349852946 + return scale * np.where(x < 0, alpha * np.expm1(x), x) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Selu(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_selu(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Selu(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_selu(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_selu(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_selu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_selu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_selu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='selu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_selu_clang4(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'clang4') + # 
------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_selu_gcc4_check(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_selu_gcc7(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_selu_intel18(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_selu.py -k 'test_unit_layer_selu_exe' --exe= -def test_unit_layer_selu_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_selu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_selu(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index 268526b7644..590f0448ff5 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -1,49 +1,196 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The L1 norm is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(2019102412) +_num_samples = 23 +_sample_size = 17 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Sigmoid(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x >= 0, + 1 / (1 + np.exp(-x)), + np.exp(x) / (1 + np.exp(x))) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Sigmoid(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x >= 0, + 1 / (1 + np.exp(-x)), + np.exp(x) / (1 + np.exp(x))) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_sigmoid(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_sigmoid: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_sigmoid_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_sigmoid_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='sigmoid', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + 
callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_sigmoid_clang4(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_sigmoid_gcc4_check(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_sigmoid_gcc7(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_sigmoid_intel18(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_sigmoid.py -k 'test_unit_layer_sigmoid_exe' --exe= -def test_unit_layer_sigmoid_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_sigmoid_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_sigmoid(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid_binary_cross_entropy.py b/bamboo/unit_tests/test_unit_layer_sigmoid_binary_cross_entropy.py new file mode 100644 index 00000000000..59f3c580457 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_sigmoid_binary_cross_entropy.py @@ -0,0 +1,204 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: Sigmoid cross entropy is not differentiable w.r.t. ground +# truth at 0 and 1. 
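+# (The second channel of each sample is used as the ground truth and is
+# clipped to [0.1, 0.9] below, keeping it away from those endpoints.)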
+np.random.seed(20191218) +_samples = np.random.normal(size=(11,2,13)).astype(np.float32) +_samples[:,1,:] = np.clip(_samples[:,1,:], 0.1, 0.9) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.SigmoidBinaryCrossEntropy(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = -x1 * np.log1p(np.exp(-x0)) - (1-x1) * np.log1p(np.exp(x0)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.SigmoidBinaryCrossEntropy(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = -x1 * np.log1p(np.exp(-x0)) - (1-x1) * np.log1p(np.exp(x0)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient 
checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_slice.py b/bamboo/unit_tests/test_unit_layer_slice.py new file mode 100644 index 00000000000..8d48b436b33 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_slice.py @@ -0,0 +1,275 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np +import pytest + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190708) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
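+    # (The weights tensor is zero-initialized, so it leaves the input data
+    # unchanged; it only gives the gradient checker a trainable input to
+    # propagate error signals through.)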
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # -------------------------- + # Slice along axis 0 + # -------------------------- + + # LBANN implementation + slice_points = (2, 3, 6, 7) + x = x_lbann + x_slice = lbann.Slice(x, axis=0, slice_points=tools.str_list(slice_points)) + y = [] + for _ in range(len(slice_points)-1): + y.append(lbann.L2Norm2(x_slice)) + z = lbann.Add(y[0], y[2]) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis0')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = [] + for j in range(len(slice_points)-1): + x_slice = x[slice_points[j]:slice_points[j+1],:,:] + y.append(tools.numpy_l2norm2(x_slice)) + z = y[0] + y[2] + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Slice along axis 1 + # -------------------------- + + # LBANN implementation + slice_points = (0, 2, 3, 4) + x = x_lbann + x_slice = lbann.Slice(x, axis=1, slice_points=tools.str_list(slice_points)) + y = [] + for _ in range(len(slice_points)-1): + y.append(lbann.L2Norm2(x_slice)) + z = lbann.Add(y[0], y[2]) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis1')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = [] + for j in range(len(slice_points)-1): + x_slice = x[:,slice_points[j]:slice_points[j+1],:] + y.append(tools.numpy_l2norm2(x_slice)) + z = y[0] + y[2] + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Slice along axis 2 + # -------------------------- + + # LBANN implementation + slice_points = (1, 3) + x = x_lbann + x_slice = lbann.Slice(x, axis=2, slice_points=tools.str_list(slice_points)) + y = [] + for _ in range(len(slice_points)-1): + y.append(lbann.L2Norm2(x_slice)) + z = y[0] + obj.append(z) + metrics.append(lbann.Metric(z, name='axis2')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = [] + for j in range(len(slice_points)-1): + x_slice = x[:,:,slice_points[j]:slice_points[j+1]] + y.append(tools.numpy_l2norm2(x_slice)) + z = y[0] + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Model-parallel + # -------------------------- + + # LBANN implementation + slice_points = (31, 54, 56, 57) + x = lbann.Reshape(x_lbann, dims=tools.str_list([105])) + x_slice = lbann.Slice(x, slice_points=tools.str_list(slice_points), + data_layout='model_parallel') + y = [] + for _ in 
range(len(slice_points)-1): + y.append(lbann.L2Norm2(x_slice)) + z = lbann.Add(y[0], y[2]) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(-1).astype(np.float64) + y = [] + for j in range(len(slice_points)-1): + x_slice = x[slice_points[j]:slice_points[j+1]] + y.append(tools.numpy_l2norm2(x_slice)) + z = y[0] + y[2] + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Gradient checking + # -------------------------- + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # -------------------------- + # Construct model + # -------------------------- + + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index dd4c3add193..7eaf4a9954b 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -1,49 +1,206 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910142) +_num_samples = 19 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy softmax +# ============================================== + +def numpy_softmax(x): + """NumPy implementation of softmax. 
+ + The computation is performed with 64-bit floats. There is also an + implementation of softmax in SciPy 1.2.0 (scipy.special.softmax). + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + y = np.exp(x - np.max(x)) + return y / np.sum(y) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softmax(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_softmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softmax(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_softmax(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_softmax(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_softmax: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_softmax_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_softmax_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', 
model_name='softmax', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_softmax_clang4(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_softmax_gcc4_check(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_softmax_gcc7(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_softmax_intel18(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_softmax_exe' --exe= -def test_unit_layer_softmax_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_softmax_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_softmax(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py index 0c017c6f93e..3d2076a6e03 100644 --- a/bamboo/unit_tests/test_unit_layer_softplus.py +++ b/bamboo/unit_tests/test_unit_layer_softplus.py @@ -1,49 +1,192 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The L1 norm is not differentiable at 0, so we make sure values +# are away from 0. 
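+# (The samples here are plain standard-normal draws; softplus is smooth
+# at 0, so no shifting away from 0 is applied.)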
+np.random.seed(2019102413) +_num_samples = 11 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softplus(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.log1p(np.exp(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softplus(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.log1p(np.exp(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_softplus(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_softplus: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_softplus_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_softplus_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', 
data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='softplus', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_softplus_clang4(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_softplus_gcc4_check(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_softplus_gcc7(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_softplus_intel18(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_softplus.py -k 'test_unit_layer_softplus_exe' --exe= -def test_unit_layer_softplus_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_softplus_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_softplus(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py index a7bed251425..46d983b8416 100644 --- a/bamboo/unit_tests/test_unit_layer_softsign.py +++ b/bamboo/unit_tests/test_unit_layer_softsign.py @@ -1,49 +1,190 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
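+# (get_sample, num_samples, and sample_dims are referenced by name when the
+# data reader imports this module, so they must remain at module scope.)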
+ +# Data +np.random.seed(2019102414) +_num_samples = 11 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softsign(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x / (1 + np.abs(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softsign(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x / (1 + np.abs(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_softsign(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_softsign: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_softsign_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_softsign_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', 
data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='softsign', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_softsign_clang4(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_softsign_gcc4_check(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_softsign_gcc7(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_softsign_intel18(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_softsign.py -k 'test_unit_layer_softsign_exe' --exe= -def test_unit_layer_softsign_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_softsign_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_softsign(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py b/bamboo/unit_tests/test_unit_layer_squared_difference.py index a05bbcc5082..1ea15aea9bc 100644 --- a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -1,49 +1,201 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
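+# (Each sample packs the two operands x0 and x1 into one flat vector of
+# length 2*7; construct_model splits them back apart with a Slice layer.)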
+ +# Data +np.random.seed(2019102415) +_samples = np.random.normal(size=(23,2,7)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.SquaredDifference(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = (x1-x0)**2 + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.SquaredDifference(x0, x1, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = (x1-x0)**2 + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_squared_difference(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_squared_difference: default_exes[%s] does not exist' % 
compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_squared_difference_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_squared_difference_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='squared_difference', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_squared_difference_clang4(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_squared_difference_gcc4_check(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_squared_difference_gcc7(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_squared_difference_intel18(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
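# Note (illustration only, not from this patch): the readers assembled below
# rely solely on the get_sample / num_samples / sample_dims functions defined
# at the top of this file. A minimal self-check of that contract, assuming the
# NumPy-backed samples above, might look like:
import numpy as np

def _check_sample_access_contract():
    # Every valid index must yield a flat array whose length matches the
    # advertised sample dimensions.
    for index in range(num_samples()):
        sample = np.asarray(get_sample(index), dtype=np.float32)
        assert sample.shape == tuple(sample_dims())
# Calling _check_sample_access_contract() before building the protobuf message
# is one way to catch shape mismatches early; LBANN's actual reader internals
# are not shown here.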
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_squared_difference.py -k 'test_unit_layer_squared_difference_exe' --exe= -def test_unit_layer_squared_difference_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_squared_difference_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_squared_difference(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index 575bd894f89..862d97fc936 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -1,49 +1,198 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') -import tools +import numpy as np import pytest -import os +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(2019102416) +_num_samples = 29 +_sample_dims = (3,1,4) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + output_dims = (7,4,3) + x = x_lbann + y = lbann.Tessellate(x, + dims=tools.str_list(output_dims), + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = np.tile(x, (3,4,1))[:7,:4,:3] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + output_dims = (2,1,9) + x = x_lbann + y = lbann.Tessellate(x, + dims=tools.str_list(output_dims), + data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = np.tile(x, (1,1,3))[:2,:1,:9] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_tessellate(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_tessellate: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_tessellate_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_tessellate_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='tessellate', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # -------------------------- + # Gradient checking + # -------------------------- + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_tessellate_clang4(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'clang4') + # -------------------------- + # Construct model + # -------------------------- + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_tessellate_gcc4_check(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, 
dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_tessellate_gcc7(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_tessellate_intel18(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_layer_tessellate.py -k 'test_unit_layer_tessellate_exe' --exe= -def test_unit_layer_tessellate_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_tessellate_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_tessellate(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index 0db001567d5..8113d45adbe 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -1,49 +1,244 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(2019102417) +_num_samples = 11 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + mini_batch_size = num_samples() // 2 + trainer = lbann.Trainer(mini_batch_size) + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=False) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=False) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, biased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, biased=True, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=True) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, biased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, biased=True, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=True) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + 
upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) -def skeleton_layer_variance(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_layer_variance: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_variance_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_variance_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', - model_folder='tests/layer_tests', model_name='variance', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def test_unit_layer_variance_clang4(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'clang4') + # ------------------------------------------ + # Construct model + # ------------------------------------------ + num_epochs = 0 + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_variance_gcc4_check(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'gcc4') +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_variance_gcc7(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'gcc7') + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_variance_intel18(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'intel18') + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
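# Note (illustrative sketch, not from this patch): the NumPy references in
# construct_model above use np.cov on a 1-D sample, which reduces to a scalar
# variance. The equivalent closed forms, for comparison, are:
import numpy as np

def _reference_variance(x, biased):
    # np.cov(x, bias=False) == sum((x - mean)^2) / (n - 1)  (unbiased)
    # np.cov(x, bias=True)  == sum((x - mean)^2) / n        (biased)
    x = np.asarray(x, dtype=np.float64)
    divisor = x.size if biased else x.size - 1
    return np.sum((x - x.mean()) ** 2) / divisor
# The check-metric tolerance used above (8 * val * np.finfo(np.float32).eps)
# allows a few float32 ULPs of drift around this expected value.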
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_variance_exe' --exe= -def test_unit_layer_variance_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_layer_variance_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_variance(cluster, exes, dirname, 'exe') +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py deleted file mode 100644 index 4b8491e248f..00000000000 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ /dev/null @@ -1,148 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os, sys - - -def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_lbann2_reload: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - lbann2 = executables[compiler_name] + '2' - - # Delete directories / files if they happen to be around from the - # previous build. - os.system('rm -rf ckpt') - os.system('rm -rf lbann2_*') - - - # No checkpointing, printing weights to files. - model_path = '{../../model_zoo/models/lenet_mnist/model_lenet_mnist.prototext,../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext}' - output_file_name = '%s/bamboo/unit_tests/output/lbann2_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/lbann2_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2, - data_reader_name='mnist', - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - dir_name=dir_name, - model_path=model_path, - optimizer_name='sgd', - num_epochs=2, - output_file_name=output_file_name, - error_file_name=error_file_name) - - os.mkdir('lbann2_ckpt') - return_code = os.system(command) - if return_code != 0: - sys.stderr.write('LBANN2 LeNet execution failed, exiting with error') - sys.exit(1) - - os.system('mv lbann2_ckpt lbann2_nockpt') - - # Run to checkpoint, printing weights to files. 
- output_file_name = '%s/bamboo/unit_tests/output/lbann2_checkpoint_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/lbann2_checkpoint_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, - error_file_name=error_file_name) - return_code_ckpt_1 = os.system(command) - if return_code_ckpt_1 != 0: - sys.stderr.write( - 'LeNet (checkpoint) execution failed, exiting with error') - sys.exit(1) - - # Pick up from checkpoint, printing weights to files. - output_file_name = '%s/bamboo/unit_tests/output/lbann2_restart_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/lbann2_restart_%s_error.txt' % (dir_name, compiler_name) - os.mkdir('lbann2_ckpt') - command = tools.get_command( - cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', - model_path='../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext', - num_epochs=2, optimizer_name='sgd', ckpt_dir='ckpt/', - output_file_name=output_file_name, - error_file_name=error_file_name) - return_code_ckpt_2 = os.system(command) - if return_code_ckpt_2 != 0: - sys.stderr.write( - 'LBANN2 LeNet weight reload failed, exiting with error') - sys.exit(1) - os.system('rm lbann2_ckpt/model0-epoch*') - os.system('rm lbann2_nockpt/model0-epoch*') - - diff_result = os.system('diff -rq lbann2_ckpt/ lbann2_nockpt/') - allow_epsilon_diff = False - if allow_epsilon_diff and (diff_result != 0): - equal_within_epsilon = True - ckpt_files = os.listdir('lbann2_ckpt') - for file_name in ckpt_files: - ckpt_file = open('lbann2_ckpt/' + file_name, 'r') - no_ckpt_file = open('lbann2_nockpt/' + file_name, 'r') - for ckpt_line in ckpt_file: - no_ckpt_line = next(no_ckpt_file) - if ckpt_line != no_ckpt_line: - error_string = ('ckpt_line={ckpt_line},' - ' nockpt_line={no_ckpt_line}').format( - ckpt_line=ckpt_line, no_ckpt_line=no_ckpt_line) - try: - ckpt_values = list(map(float, ckpt_line.split())) - no_ckpt_values = list(map(float, no_ckpt_line.split())) - num = len(ckpt_values) - if len(no_ckpt_values) == num: - for i in range(num): - if abs(ckpt_values[i] - no_ckpt_values[i]) > 0.5: - # Not equal within epsilon. - equal_within_epsilon = False - print(error_string) - else: - # Length of lists don't match. - equal_within_epsilon = False - print(error_string) - except ValueError: - # Non-numerical diff. 
- equal_within_epsilon = False - print(error_string) - if equal_within_epsilon: - diff_result = 0 - os.system('rm -rf ckpt') - os.system('rm -rf lbann2_*') - assert diff_result == 0 - - -def test_unit_lbann2_reload_clang4(cluster, exes, dirname): - if cluster == 'catalyst': # STILL ERRORS - pytest.skip('FIXME') - skeleton_lbann2_reload(cluster, exes, dirname, 'clang4') - - -def test_unit_lbann2_reload_gcc4(cluster, exes, dirname): - skeleton_lbann2_reload(cluster, exes, dirname, 'gcc4') - - -def test_unit_lbann2_reload_gcc7(cluster, exes, dirname): - if cluster in ['catalyst', 'pascal']: # STILL ERRORS - pytest.skip('FIXME') - skeleton_lbann2_reload(cluster, exes, dirname, 'gcc7') - - -def test_unit_lbann2_reload_intel18(cluster, exes, dirname): - skeleton_lbann2_reload(cluster, exes, dirname, 'intel18') - - -# Run with python -m pytest -s test_unit_lbann2_reload.py -k 'test_unit_lbann2_reload_exe' --exe= -def test_unit_lbann2_reload_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_lbann2_reload_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_lbann2_reload(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_lbann_invocation.py b/bamboo/unit_tests/test_unit_lbann_invocation.py index a002db49be4..1ea9a6ae19e 100644 --- a/bamboo/unit_tests/test_unit_lbann_invocation.py +++ b/bamboo/unit_tests/test_unit_lbann_invocation.py @@ -1,93 +1,209 @@ import sys sys.path.insert(0, '../common_python') import tools -import os, sys +import os -def test_unit_no_params_bad(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with no params; lbann should throw exception\n') + +def get_default_parameters(dir_name, two_models=True): + data_reader_path = '{d}/model_zoo/data_readers/data_reader_mnist.prototext'.format( + d=dir_name) + model_path = '{d}/model_zoo/tests/model_lenet_mnist_ckpt.prototext'.format( + d=dir_name) + if two_models: + model_path = '{{{mp},{mp}}}'.format(mp=model_path) + optimizer_path = '{d}/model_zoo/optimizers/opt_sgd.prototext'.format( + d=dir_name) + return data_reader_path, model_path, optimizer_path + + +def get_file_names(dir_name, test_name): + output_file_name = '{d}/bamboo/unit_tests/output/lbann_invocation_{t}_output.txt'.format( + d=dir_name, t=test_name) + error_file_name = '{d}/bamboo/unit_tests/error/lbann_invocation_{t}_error.txt'.format( + d=dir_name, t=test_name) + return output_file_name, error_file_name + + +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_no_params_bad' --exes= +def test_unit_no_params_bad(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with no params; lbann should throw exception\n') + (output_file_name, error_file_name) = get_file_names(dirname, 'no_params_bad') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True) + cluster=cluster, executable=exe, + exit_after_setup=True, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'Failed to load any prototext files', + error_file_name) -def test_unit_one_model_bad(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with no optimizer or reader; lbann should throw exception\n') - model_path = 'prototext/model_mnist_simple_1.prototext' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 
'test_unit_one_model_bad' --exes= +def test_unit_one_model_bad(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with no optimizer or reader; lbann should throw exception\n') + (_, model_path, _) = get_default_parameters(dirname, two_models=False) + (output_file_name, error_file_name) = get_file_names(dirname, 'one_model_bad') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True, - model_path=model_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 1 model filenames, and 0 optimizer filenames; you must specify either one or 1 optimizer filenames', + error_file_name) -def test_unit_two_models_bad(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with two models but no optimizer or reader; lbann should throw exception\n') - model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_two_models_bad' --exes= +def test_unit_two_models_bad(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with two models but no optimizer or reader; lbann should throw exception\n') + (_, model_path, _) = get_default_parameters(dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'two_models_bad') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True, - model_path=model_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 2 model filenames, and 0 optimizer filenames; you must specify either one or 2 optimizer filenames', + error_file_name) -def test_unit_two_models_bad2(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with two models with missing {; lbann should throw exception\n') - model_path='prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_two_models_bad2' --exes= +def test_unit_two_models_bad2(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with two models with missing {; lbann should throw exception\n') + (_, model_path, _) = get_default_parameters(dirname, two_models=False) + model_path = '{mp},{mp}}}'.format(mp=model_path) + (output_file_name, error_file_name) = get_file_names(dirname, 'two_models_bad2') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True, - model_path=model_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + "possibly you left out '{' or '}' or both", + error_file_name) -def test_unit_missing_optimizer(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run 
lbann with two models, reader, but no optimizer; lbann should throw exception\n') - model_path='{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' - data_reader_path='prototext/data_reader_mnist.prototext' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_missing_optimizer' --exes= +def test_unit_missing_optimizer(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with two models, reader, but no optimizer; lbann should throw exception\n') + (data_reader_path, model_path, _) = get_default_parameters(dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'missing_optimizer') command = tools.get_command( - cluster=cluster, executable=exe, data_reader_path=data_reader_path, + cluster=cluster, executable=exe, + data_reader_path=data_reader_path, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - exit_after_setup=True, model_path=model_path) + exit_after_setup=True, model_path=model_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 2 model filenames, and 0 optimizer filenames; you must specify either one or 2 optimizer filenames', + error_file_name) -def test_unit_missing_reader(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with two models, reader, but no reader; lbann should throw exception\n') - model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' - optimizer_path = 'prototext/opt_sgd.prototext' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_missing_reader' --exes= +def test_unit_missing_reader(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with two models, reader, but no reader; lbann should throw exception\n') + (_, model_path, optimizer_path) = get_default_parameters(dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'missing_reader') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True, - model_path=model_path, optimizer_path=optimizer_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, optimizer_path=optimizer_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 2 model filenames, and 0 reader filenames; you must specify either one or 2 reader filenames', + error_file_name) -def test_unit_bad_params(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with ill-formed param (missing -) lbann should throw exception\n') - (command_allocate, command_run, _, _) = tools.get_command(cluster=cluster, executable=exe, return_tuple=True) - return_code = os.system('%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext --model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} --optimizer=prototext/opt_sgd.prototext' % (command_allocate, command_run, exe)) - assert return_code != 0 +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_bad_params' --exes= +def test_unit_bad_params(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + 
else: + exe = exes + print('TESTING: run lbann with ill-formed param (exit_after_setup should have `--` not `-`) lbann should throw exception\n') + (data_reader_path, model_path, optimizer_path) = get_default_parameters( + dirname) + (command_allocate, command_run, _, _) = tools.get_command( + cluster=cluster, executable=exe, + num_processes=1, + return_tuple=True) + (output_file_name, error_file_name) = get_file_names(dirname, 'bad_params') + command_string = '{ca}{cr} {e} -exit_after_setup --reader={d} --model={m} --optimizer={o} > {ofn} 2> {efn}'.format( + ca=command_allocate, cr=command_run, e=exe, + d=data_reader_path, m=model_path, o=optimizer_path, + ofn=output_file_name, efn=error_file_name + ) + return_code = os.system(command_string) + tools.assert_failure(return_code, + "badly formed cmd line param; must begin with '--': -exit_after_setup", + error_file_name) -def test_unit_should_work(cluster, exes): - exe = exes['gcc4'] - sys.stderr.write('TESTING: run lbann with two models, reader, and optimizer; lbann should NOT throw exception\n') - model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' - data_reader_path = 'prototext/data_reader_mnist.prototext' - optimizer_path = 'prototext/opt_sgd.prototext' +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_should_work' --exes= +def test_unit_should_work(cluster, dirname, exes): + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes + print('TESTING: run lbann with two models, reader, and optimizer; lbann should NOT throw exception\n') + (data_reader_path, model_path, optimizer_path) = get_default_parameters( + dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'should_work') command = tools.get_command( cluster=cluster, executable=exe, data_reader_path=data_reader_path, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', exit_after_setup=True, model_path=model_path, - optimizer_path=optimizer_path) + optimizer_path=optimizer_path, + num_processes=1, + output_file_name=output_file_name, + error_file_name=error_file_name) return_code = os.system(command) - assert return_code != 0 + tools.assert_success(return_code, error_file_name) diff --git a/bamboo/unit_tests/test_unit_load_weights_lenet.py b/bamboo/unit_tests/test_unit_load_weights_lenet.py new file mode 100644 index 00000000000..0db1f94bb04 --- /dev/null +++ b/bamboo/unit_tests/test_unit_load_weights_lenet.py @@ -0,0 +1,265 @@ +import os.path +import re +import sys +import math +import numpy as np +import google.protobuf.text_format +import pytest +import glob + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 4 +num_ckpt_epochs = int(float(num_epochs)/2) +num_restart_epochs = num_epochs - num_ckpt_epochs +mini_batch_size = 64 +num_nodes = 1 +lenet_fraction = 0.01 +random_seed = 20191206 + +test_name_base='test_unit_load_weights_lenet' +checkpoint_dir='ckpt' +save_model_dir='model_weights' + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer(mini_batch_size=mini_batch_size, + random_seed=random_seed) + + # Checkpoint after every epoch + trainer.callbacks = [ + lbann.CallbackCheckpoint( + checkpoint_dir=checkpoint_dir, + checkpoint_epochs=1, + checkpoint_steps=845 + ) + ] + + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.models + + # Manually override the global count so that each model is named the same + lbann.models.LeNet.global_count = 0 + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.LeNet(10)(images) + probs = lbann.Softmax(x) + loss = lbann.CrossEntropy(probs, labels) + acc = lbann.CategoricalAccuracy(probs, labels) + + # Make sure all layers are on CPU + for layer in lbann.traverse_layer_graph(input_): + layer.device = 'cpu' + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackSaveModel(dir=save_model_dir)] + metrics = [lbann.Metric(acc, name='accuracy', unit='%')] + + # Construct model + return lbann.Model(num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.contrib.lc.paths + + # Load data readers from prototext + dirname = os.path.dirname + lbann_dir = dirname(dirname(dirname(os.path.realpath(__file__)))) + pb_file = os.path.join(lbann_dir, + 'model_zoo', + 'data_readers', + 'data_reader_mnist.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(pb_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set location of MNIST data + for reader in message.reader: + reader.data_filedir = lbann.contrib.lc.paths.mnist_dir() + reader.percent_of_data_to_use = lenet_fraction + + + # Validation set + message.reader[0].validation_percent = 0.1 + + return message + +# ============================================== +# Setup PyTest +# ============================================== + +def create_test_func(test_func): + """Augment test function to cascade multiple tests and parse results. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. 
+ + Returns: + function: Test that can interact with PyTest. + + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname, weekly): + + # Run LBANN experiment baseline + print('\n################################################################################') + print('Running model halfway ') + print('################################################################################\n') + baseline_test_output = test_func(cluster, exes, dirname) + baseline_training_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + baseline_validation_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'validation objective function') + baseline_test_metrics = tools.collect_metrics_from_log_func(baseline_test_output['stdout_log_file'], 'test objective function') + + # Run LBANN model to checkpoint + print('\n################################################################################') + print('Running model to checkpointed weights') + print('################################################################################\n') + test_func_checkpoint = tools.create_tests( + setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='reload_weights_from_checkpoint', + lbann_args=['--disable_cuda=True', + '--num_epochs='+str(num_restart_epochs), + '--load_model_weights_dir='+ os.path.join(baseline_test_output['work_dir'], checkpoint_dir, 'trainer0')], + ) + + checkpoint_test_output = test_func_checkpoint[0](cluster, exes, dirname) + checkpoint_training_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + checkpoint_validation_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'validation objective function') + checkpoint_test_metrics = tools.collect_metrics_from_log_func(checkpoint_test_output['stdout_log_file'], 'test objective function') + + print('\n################################################################################') + print('Running model from save_model weights') + print('################################################################################\n') + test_func_restart = tools.create_tests( + setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='reload_weights_from_save_model_cb', + lbann_args=['--disable_cuda=True', + '--num_epochs='+str(num_restart_epochs), + '--load_model_weights_dir='+ os.path.join(baseline_test_output['work_dir'], save_model_dir, 'trainer0', 'model0/'), + '--load_model_weights_dir_is_complete=True'], + ) + + # Restart LBANN model and run to completion + restart_test_output = test_func_restart[0](cluster, exes, dirname) + restart_training_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'training epoch [0-9]+ objective function') + restart_validation_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'validation objective function') + restart_test_metrics = tools.collect_metrics_from_log_func(restart_test_output['stdout_log_file'], 'test objective function') + + print('\n################################################################################') + print('Comparing results of models') + print('################################################################################\n') + + # Check if metrics are same in baseline and test 
experiments + # Note: "Print statistics" callback will print up to 6 digits + # of metric values. + + # Comparing training objective functions + tools.compare_metrics(checkpoint_training_metrics, restart_training_metrics) + # Comparing validation objective functions + tools.compare_metrics(checkpoint_validation_metrics, restart_validation_metrics) + # Comparing test objective functions + tools.compare_metrics(checkpoint_test_metrics, restart_test_metrics) + + baseline_ckpt=os.path.join(baseline_test_output['work_dir'], checkpoint_dir) + checkpoint_ckpt=os.path.join(checkpoint_test_output['work_dir'], checkpoint_dir) + restart_ckpt=os.path.join(restart_test_output['work_dir'], checkpoint_dir) + + err = 0 + err_dirs = '' + fileList = glob.glob('{base}/trainer0/*'.format(base=checkpoint_ckpt)) + fileList, tmp_err, tmp_err_str = tools.multidir_diff(checkpoint_ckpt, restart_ckpt, fileList) + err += tmp_err + err_dirs += tmp_err_str + + err_msg = "\nUnmatched checkpoints:\n" + for f in fileList: + err_msg += f + "\n" + assert len(fileList) == 0, \ + 'Extra checkpoint data in baseline directory: ' + err_msg + assert err == 0, err_dirs + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + test_name_base=test_name_base, + nodes=num_nodes, + work_subdir='baseline', + lbann_args=['--disable_cuda=True', + ' --num_epochs='+str(num_ckpt_epochs)]): + globals()[_test_func.__name__] = create_test_func(_test_func) diff --git a/bamboo/unit_tests/test_unit_mnist_conv_graph.py b/bamboo/unit_tests/test_unit_mnist_conv_graph.py deleted file mode 100644 index 65a7bd54ad0..00000000000 --- a/bamboo/unit_tests/test_unit_mnist_conv_graph.py +++ /dev/null @@ -1,56 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - - -def skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_mnist_conv_graph: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/mnist_conv_graph_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/mnist_conv_graph_%s_error.txt' % (dir_name, compiler_name) - if compiler_name == 'gcc7': - tl = 240 - else: - tl = None - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, time_limit=tl, num_processes=1, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='mnist_conv_graph', - optimizer_name='adam', - output_file_name=output_file_name, - error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 - - -def test_unit_mnist_conv_graph_clang4(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'clang4') - - -def test_unit_mnist_conv_graph_gcc4(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc4') - - -def test_unit_mnist_conv_graph_gcc7(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc7') - - -def test_unit_mnist_conv_graph_intel18(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'intel18') - - -# Run with python -m pytest -s test_unit_conv_graph.py -k 'test_unit_mnist_conv_graph_exe' --exe= -def 
test_unit_mnist_conv_graph_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_mnist_conv_graph_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_mnist_conv_graph(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py deleted file mode 100644 index 0d4d3994837..00000000000 --- a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py +++ /dev/null @@ -1,50 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - - -def skeleton_mnist_ridge_regression(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_mnist_ridge_regression: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/mnist_ridge_regression_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/mnist_ridge_regression_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=1, dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', - model_folder='tests', model_name='mnist_ridge_regression', - optimizer_name='adam', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 - - -def test_unit_mnist_ridge_regression_clang4(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'clang4') - - -def test_unit_mnist_ridge_regression_gcc4(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc4') - - -def test_unit_mnist_ridge_regression_gcc7(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc7') - - -def test_unit_mnist_ridge_regression_intel18(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'intel18') - - -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_mnist_ridge_regression_exe' --exe= -def test_unit_mnist_ridge_regression_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_mnist_ridge_regression_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py deleted file mode 100644 index 8718c0e5802..00000000000 --- a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py +++ /dev/null @@ -1,50 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - - -def skeleton_mnist_softmax_classifier(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_mnist_softmax_classifier: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/mnist_softmax_classifier_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/mnist_softmax_classifier_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=1, dir_name=dir_name, - 
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', - model_folder='tests', model_name='mnist_softmax_classifier', - optimizer_name='adam', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 - - -def test_unit_mnist_softmax_classifier_clang4(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'clang4') - - -def test_unit_mnist_softmax_classifier_gcc4(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc4') - - -def test_unit_mnist_softmax_classifier_gcc7(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc7') - - -def test_unit_mnist_softmax_classifier_intel18(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'intel18') - - -# Run with python -m pytest -s test_unit_softmax_classifier.py -k 'test_unit_mnist_softmax_classifier_exe' --exe= -def test_unit_mnist_softmax_classifier_exe(cluster, dirname, exe): - if exe is None: - e = 'test_unit_mnist_softmax_classifier_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_reconstruction_loss.py b/bamboo/unit_tests/test_unit_reconstruction_loss.py new file mode 100644 index 00000000000..fbdd0125aa6 --- /dev/null +++ b/bamboo/unit_tests/test_unit_reconstruction_loss.py @@ -0,0 +1,72 @@ +import sys +sys.path.insert(0, '../common_python') +import os +import pytest +import tools + + +def skeleton_jag_reconstruction_loss(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): + if compiler_name not in executables: + e = 'skeleton_jag_reconstruction_loss: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) + if cluster == 'ray': + e = 'skeleton_jag_reconstruction_loss: dataset does not exist on %s' % cluster + print('Skip - ' + e) + pytest.skip(e) + #if cluster == 'lassen': + #e = 'skeleton_jag_reconstruction_loss: FIXME dataset consistency issues on Lassen' + #print('Skip - ' + e) + #pytest.skip(e) + output_file_name = '%s/bamboo/unit_tests/output/jag_reconstruction_loss_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/jag_reconstruction_loss_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, + executable=executables[compiler_name], + num_nodes=2, + num_processes=32, + disable_cuda=1, + dir_name=dir_name, + data_filedir_train_default='/p/lscratchh/brainusr/datasets/10MJAG/1M_A/100K4trainers', + data_filedir_test_default='/p/lscratchh/brainusr/datasets/10MJAG/1M_A/100K16trainers', + data_reader_name='jag', + data_reader_percent='prototext', + metadata='applications/physics/data/jag_100M_metadata.prototext', + model_folder='tests', + model_name='jag_single_layer_ae', + optimizer_name='adam', + output_file_name=output_file_name, + error_file_name=error_file_name, weekly=weekly) + return_code = os.system(command) + tools.assert_success(return_code, error_file_name) + + +def test_unit_jag_reconstruction_loss_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) + + +def test_unit_jag_reconstruction_loss_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_jag_reconstruction_loss(cluster, exes, 
dirname, 'gcc7', + weekly, data_reader_percent) + + +def test_unit_jag_reconstruction_loss_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) + + +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_jag_reconstruction_loss_exe' --exe= +def test_unit_jag_reconstruction_loss_exe(cluster, dirname, exe, + weekly, data_reader_percent): + if exe is None: + e = 'test_unit_jag_reconstruction_loss_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) diff --git a/cmake/configure_files/LBANNConfig.cmake.in b/cmake/configure_files/LBANNConfig.cmake.in index 2ac6ed91a9f..6c03819c99b 100644 --- a/cmake/configure_files/LBANNConfig.cmake.in +++ b/cmake/configure_files/LBANNConfig.cmake.in @@ -10,9 +10,11 @@ list(APPEND CMAKE_MODULE_PATH "@EXTRA_CMAKE_MODULE_DIR@") set(LBANN_VERSION ${PACKAGE_VERSION}) +set(LBANN_BUILD_TYPE "@CMAKE_BUILD_TYPE@") + # Record compiler information set(LBANN_CXX_COMPILER "@CMAKE_CXX_COMPILER@") -set(LBANN_CUDA_COMPILER "$@CMAKE_CUDA_COMPILER@") +set(LBANN_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@") set(LBANN_CXX_FLAGS "@CMAKE_CXX_FLAGS@") set(LBANN_CUDA_FLAGS "@CMAKE_CUDA_FLAGS@") @@ -28,6 +30,7 @@ if (CMAKE_CXX_STANDARD LESS LBANN_CXX_STANDARD) endif () set(CMAKE_CXX_STANDARD_REQUIRED TRUE) + # Record the various flags and switches accumlated in LBANN set(LBANN_ALUMINUM_MPI_PASSTHROUGH @LBANN_ALUMINUM_MPI_PASSTHROUGH@) set(LBANN_BUILT_WITH_SPECTRUM @LBANN_BUILT_WITH_SPECTRUM@) @@ -37,7 +40,7 @@ set(LBANN_GNU_LINUX @LBANN_GNU_LINUX@) set(LBANN_HAS_ALUMINUM @LBANN_HAS_ALUMINUM@) set(LBANN_HAS_CEREAL @LBANN_HAS_CEREAL@) set(LBANN_HAS_CNPY @LBANN_HAS_CNPY@) -set(LBANN_HAS_CONDUIT @LBANN_HAS_CONDUIT@) +set(LBANN_HAS_CONDUIT @LBANN_WITH_CONDUIT@) set(LBANN_HAS_CUDA @LBANN_HAS_CUDA@) set(LBANN_HAS_CUDNN @LBANN_HAS_CUDNN@) set(LBANN_HAS_DOXYGEN @LBANN_HAS_DOXYGEN@) @@ -46,28 +49,38 @@ set(LBANN_HAS_LBANN_PROTO @LBANN_HAS_LBANN_PROTO@) set(LBANN_HAS_OPENCV @LBANN_HAS_OPENCV@) set(LBANN_HAS_NCCL2 @LBANN_HAS_NCCL2@) set(LBANN_HAS_PROTOBUF @LBANN_HAS_PROTOBUF@) +set(LBANN_HAS_PYTHON @LBANN_HAS_PYTHON@) set(LBANN_HAS_TBINF @LBANN_HAS_TBINF@) set(LBANN_HAS_VTUNE @LBANN_HAS_VTUNE@) -set(LBANN_NO_OMP_FOR_DATA_READERS @LBANN_NO_OMP_FOR_DATA_READERS@) set(LBANN_NVPROF @LBANN_NVPROF@) -set(LBANN_SEQUENTIAL_INITIALIZATION @LBANN_SEQUENTIAL_INITIALIZAION@) set(LBANN_TOPO_AWARE @LBANN_TOPO_AWARE@) # Setup dependencies +find_package(Threads REQUIRED) -# First, CEREAL. if (LBANN_HAS_CEREAL) - find_package(CEREAL NO_MODULE + find_package(CEREAL NO_MODULE QUIET HINTS ${CEREAL_DIR} $ENV{CEREAL_DIR} PATH_SUFFIXES share/cmake/cereal NO_DEFAULT_PATH) if (NOT CEREAL_FOUND) - find_package(CEREAL NO_MODULE) + find_package(CEREAL NO_MODULE QUIET) endif () if (NOT CEREAL_FOUND AND NOT CEREAL_DIR) set(CEREAL_DIR "@CEREAL_DIR@") find_package(CEREAL NO_MODULE REQUIRED) endif () + if (NOT CEREAL_FOUND) + message(FATAL_ERROR "Required dependency CEREAL not found.") + endif () +endif () + +if (NOT HWLOC_DIR) + set(HWLOC_DIR "@HWLOC_DIR@") +endif () +if (LBANN_TOPO_AWARE) + find_package(HWLOC REQUIRED) + set(LBANN_TOPO_AWARE ${HWLOC_FOUND}) endif () # Next, Hydrogen. 
We can probably inherit Aluminum-ness from @@ -192,6 +205,102 @@ if (LBANN_HAS_CUDA) include(SetupCUDAToolkit) endif (LBANN_HAS_CUDA) +set(_LBANN_CONDUIT_DIR "@Conduit_DIR@") +set(_LBANN_HDF5_DIR "@HDF5_DIR@") +if (LBANN_HAS_CONDUIT) + # Apparently we have to find HDF5, too. + find_package(HDF5 CONFIG QUIET + HINTS ${HDF5_DIR} $ENV{HDF5_DIR} ${_LBANN_HDF5_DIR} + PATH_SUFFIXES share/cmake/hdf5 + NO_DEFAULT_PATH) + if (NOT HDF5_FOUND) + find_package(HDF5 CONFIG QUIET) + endif () + if (NOT HDF5_FOUND) + enable_language(C) # WHY?????????????? + find_package(HDF5 REQUIRED) + set(HDF5_FOUND_WITH_MODULE TRUE) + else () + message(STATUS "Found HDF5: ${HDF5_DIR}") + endif () + + find_package(Conduit CONFIG QUIET + HINTS ${Conduit_DIR} $ENV{Conduit_DIR} + ${CONDUIT_DIR} $ENV{CONDUIT_DIR} + ${_LBANN_CONDUIT_DIR} + PATH_SUFFIXES lib64/cmake lib/cmake + NO_DEFAULT_PATH) + if (NOT Conduit_FOUND) + find_package(Conduit CONFIG REQUIRED + PATH_SUFFIXES lib64/cmake lib/cmake) + endif () + message(STATUS "Found CONDUIT: ${Conduit_DIR}") + + # Ugh. I don't like that this requires intimate knowledge of + # specific targets that CONDUIT exports. It should support + # components. + if (NOT TARGET conduit_relay_mpi) + message(FATAL_ERROR "CONDUIT does not have proper MPI support.") + endif () + + if (NOT TARGET conduit OR NOT TARGET conduit_relay + OR NOT TARGET conduit_blueprint) + message(FATAL_ERROR "Missing some CONDUIT required library.") + endif () + + if (NOT TARGET conduit::conduit) + add_library(conduit::conduit INTERFACE IMPORTED) + endif () + + set(_conduit_interface_link_libs + "conduit;conduit_relay;conduit_relay_mpi;conduit_blueprint") + + # Remove -pthread from linkage, if found + foreach (_lib IN LISTS _conduit_interface_link_libs) + if (TARGET ${_lib}) + get_property(_tmp_interface_link_libs TARGET ${_lib} + PROPERTY INTERFACE_LINK_LIBRARIES) + + list(FIND _tmp_interface_link_libs "-pthread" _pthread_idx) + if (_pthread_idx GREATER_EQUAL 0) + list(REMOVE_AT _tmp_interface_link_libs ${_pthread_idx}) + + set_property(TARGET ${_lib} PROPERTY + INTERFACE_LINK_LIBRARIES ${_tmp_interface_link_libs}) + endif () + + get_property(_tmp_interface_compile_opts TARGET ${_lib} + PROPERTY INTERFACE_COMPILE_OPTIONS) + set_property(TARGET ${_lib} + PROPERTY INTERFACE_COMPILE_OPTIONS + $<$:${_tmp_interface_compile_opts}>) + endif () + endforeach () + + get_filename_component(_conduit_include_dirs + "${CONDUIT_INCLUDE_DIRS}" DIRECTORY) + + if (HDF5_FOUND_WITH_MODULE) + list(APPEND _conduit_interface_link_libs + ${HDF5_LIBRARIES}) + + list(APPEND _conduit_include_dirs + "${HDF5_INCLUDE_DIRS}") + endif () + + set_property(TARGET conduit::conduit + PROPERTY + INTERFACE_INCLUDE_DIRECTORIES + "${_conduit_include_dirs}") + + set_target_properties(conduit::conduit + PROPERTIES + INTERFACE_LINK_LIBRARIES + "${_conduit_interface_link_libs}") + + set(CONDUIT_LIBRARIES conduit::conduit) +endif (LBANN_HAS_CONDUIT) + @PACKAGE_INIT@ # Now actually import the LBANN target diff --git a/cmake/configure_files/lbann_config.hpp.in b/cmake/configure_files/lbann_config.hpp.in index 76b50bc920c..c011e492454 100644 --- a/cmake/configure_files/lbann_config.hpp.in +++ b/cmake/configure_files/lbann_config.hpp.in @@ -24,19 +24,27 @@ #cmakedefine LBANN_GNU_LINUX #cmakedefine LBANN_HAS_CEREAL +#cmakedefine LBANN_HAS_DIHYDROGEN #cmakedefine LBANN_HAS_OPENCV #cmakedefine LBANN_HAS_TBINF #cmakedefine LBANN_HAS_CNPY #cmakedefine LBANN_HAS_VTUNE #cmakedefine LBANN_HAS_ALUMINUM #cmakedefine LBANN_ALUMINUM_MPI_PASSTHROUGH -#cmakedefine 
LBANN_HAS_CONDUIT #cmakedefine LBANN_HAS_PYTHON +#cmakedefine LBANN_HAS_SHMEM +#cmakedefine LBANN_HAS_LARGESCALE_NODE2VEC #cmakedefine LBANN_DETERMINISTIC #cmakedefine LBANN_HAS_CUDA #cmakedefine LBANN_HAS_CUDNN +#ifdef LBANN_HAS_CUDA +#cmakedefine LBANN_HAS_NVSHMEM +#endif + +#cmakedefine LBANN_HAS_HALF +#cmakedefine LBANN_HAS_GPU_FP16 #cmakedefine LBANN_VTUNE #cmakedefine LBANN_NVPROF @@ -46,6 +54,11 @@ #cmakedefine LBANN_HAS_STD_ANY #cmakedefine LBANN_HAS_STD_MAKE_UNIQUE +// API support for non-portable pthread functionality. +#cmakedefine LBANN_HAS_PTHREAD_AFFINITY_SUPPORT + +#cmakedefine LBANN_HAS_DISTCONV + // Define the LBANN datatype namespace lbann { diff --git a/cmake/configure_files/lbann_module.lua.in b/cmake/configure_files/lbann_module.lua.in index 754d2c6106d..e6ea77453ae 100644 --- a/cmake/configure_files/lbann_module.lua.in +++ b/cmake/configure_files/lbann_module.lua.in @@ -22,7 +22,6 @@ -- LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@ -- LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@ -- LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@ --- LBANN_HAS_CONDUIT: @LBANN_HAS_CONDUIT@ -- LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@ help( @@ -58,7 +57,6 @@ whatis("LBANN_NVPROF: @LBANN_NVPROF@") whatis("LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@") whatis("LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@") whatis("LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@") -whatis("LBANN_HAS_CONDUIT: @LBANN_HAS_CONDUIT@") whatis("LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@") prepend_path("PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@") diff --git a/cmake/configure_files/lbann_module.tcl.in b/cmake/configure_files/lbann_module.tcl.in new file mode 100644 index 00000000000..3ec52a66ce6 --- /dev/null +++ b/cmake/configure_files/lbann_module.tcl.in @@ -0,0 +1,57 @@ +#%Module + +# Lua (and hence LMod) should be preferred, but this will +# satisfy... less modern system needs. + +set name lbann +set version @LBANN_VERSION@ +set root @CMAKE_INSTALL_PREFIX@ + +conflict $name + +set fullname LBANN +set url https://github.com/llnl/lbann +set docs https://lbann.readthedocs.io + +set description "LBANN: Livermore Big Artificial Neural Network Toolkit." + +proc ModulesHelp { } { + global description url docs + puts stderr "Description - $description" + puts stderr + puts stderr "Docs - $url" +} + +module-whatis "Package: LBANN +Version: @LBANN_VERSION@ +Description: Livermore Big Artificial Neural Network Toolkit. + A distributed memory, HPC-optimized, model and data parallel + training toolkit for deep neural networks. 
+URL: https://github.com/llnl/lbann +Configuration: + CMAKE_INSTALL_PREFIX: @CMAKE_INSTALL_PREFIX@ + CMAKE_BUILD_TYPE: @CMAKE_BUILD_TYPE@ + CXX Compiler: @CMAKE_CXX_COMPILER@ + CXX FLAGS: @CMAKE_CXX_FLAGS@ + CXX FLAGS_DEBUG: @CMAKE_CXX_FLAGS_DEBUG@ + CXX FLAGS_RELWITHDEBINFO: @CMAKE_CXX_FLAGS_RELWITHDEBINFO@ + CXX FLAGS_RELEASE: @CMAKE_CXX_FLAGS_RELEASE@ + LBANN_GNU_LINUX: @LBANN_GNU_LINUX@ + LBANN_HAS_HYDROGEN: @LBANN_HAS_HYDROGEN@ + LBANN_HAS_OPENCV: @LBANN_HAS_OPENCV@ + LBANN_HAS_CEREAL: @LBANN_HAS_CEREAL@ + LBANN_HAS_CUDA: @LBANN_HAS_CUDA@ + LBANN_HAS_CUDNN: @LBANN_HAS_CUDNN@ + LBANN_HAS_NCCL2: @LBANN_HAS_NCCL2@ + LBANN_HAS_PROTOBUF: @LBANN_HAS_PROTOBUF@ + LBANN_HAS_CNPY: @LBANN_HAS_CNPY@ + LBANN_HAS_TBINF: @LBANN_HAS_TBINF@ + LBANN_HAS_VTUNE: @LBANN_HAS_VTUNE@ + LBANN_NVPROF: @LBANN_NVPROF@ + LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@ + LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@ + LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@ + LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@" + +prepend-path PATH $root/@CMAKE_INSTALL_BINDIR@ +prepend-path PYTHONPATH @PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@ diff --git a/cmake/configure_files/setup.py.in b/cmake/configure_files/setup.py.in index bd6dae0516b..c56a7df3a4b 100644 --- a/cmake/configure_files/setup.py.in +++ b/cmake/configure_files/setup.py.in @@ -9,9 +9,9 @@ config_file = '@_PYTHON_CONFIG_INI@' # Get relative paths # Note: setuptools does not accept absolute paths -current_dir = os.path.dirname(os.path.abspath(__file__)) -src_dir = os.path.relpath(os.path.abspath(src_dir), current_dir) -config_file = os.path.relpath(os.path.abspath(config_file), current_dir) +current_dir = os.path.dirname(os.path.realpath(__file__)) +src_dir = os.path.relpath(os.path.realpath(src_dir), current_dir) +config_file = os.path.relpath(os.path.realpath(config_file), current_dir) # Setup package setuptools.setup( @@ -24,8 +24,7 @@ setuptools.setup( packages=setuptools.find_packages(src_dir), package_dir={'': src_dir}, data_files=[('lbann', [config_file])], - install_requires=['graphviz>=0.10.1', - 'matplotlib>=2.0.2', + install_requires=['matplotlib>=2.0.2', 'numpy>=1.16.0', 'onnx>=1.3.0', 'pandas>=0.24.1', diff --git a/cmake/modules/FindBreathe.cmake b/cmake/modules/FindBreathe.cmake index c1f2d2c5fa2..36f9499c1b8 100644 --- a/cmake/modules/FindBreathe.cmake +++ b/cmake/modules/FindBreathe.cmake @@ -10,7 +10,7 @@ find_program(BREATHE_EXECUTABLE breathe-apidoc PATH_SUFFIXES bin DOC "The breathe documentation tool." 
NO_DEFAULT_PATH) -find_program(BREATHE_EXECUTABLE breathe-build) +find_program(BREATHE_EXECUTABLE breathe-apidoc) # Standard handling of the package arguments include(FindPackageHandleStandardArgs) diff --git a/cmake/modules/FindClara.cmake b/cmake/modules/FindClara.cmake new file mode 100644 index 00000000000..ff2f02cafd3 --- /dev/null +++ b/cmake/modules/FindClara.cmake @@ -0,0 +1,34 @@ +# Output variables +# +# Clara_FOUND +# Clara_LIBRARIES +# Clara_INCLUDE_PATH +# +# Also creates an imported target clara::clara + +# Find the header +find_path(CLARA_INCLUDE_PATH clara.hpp + HINTS ${CLARA_DIR} $ENV{CLARA_DIR} ${Clara_DIR} $ENV{Clara_DIR} + PATH_SUFFIXES include + NO_DEFAULT_PATH) +find_path(CLARA_INCLUDE_PATH clara.hpp) + +# Handle the find_package arguments +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + Clara DEFAULT_MSG CLARA_INCLUDE_PATH) + +# Build the imported target +if (NOT TARGET clara::clara) + add_library(clara::clara INTERFACE IMPORTED) +endif() + +set_property(TARGET clara::clara + PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${CLARA_INCLUDE_PATH}) + +# Set the last of the output variables +set(CLARA_LIBRARIES clara::clara) + +# Cleanup +mark_as_advanced(FORCE CLARA_INCLUDE_PATH) diff --git a/cmake/modules/FindNVSHMEM.cmake b/cmake/modules/FindNVSHMEM.cmake new file mode 100644 index 00000000000..b711c48e55e --- /dev/null +++ b/cmake/modules/FindNVSHMEM.cmake @@ -0,0 +1,46 @@ +# Output variables +# +# NVSHMEM_FOUND +# NVSHMEM_LIBRARY +# NVSHMEM_INCLUDE_DIRS +# +# Also creates an imported target NVSHMEM::NVSHMEM + +# Find the library +find_library(NVSHMEM_LIBRARY nvshmem + HINTS ${NVSHMEM_DIR} $ENV{NVSHMEM_DIR} + PATH_SUFFIXES lib lib64 + NO_DEFAULT_PATH + DOC "The location of NVSHMEM library.") +find_library(NVSHMEM_LIBRARY nvshmem) + +# Find the header +find_path(NVSHMEM_INCLUDE_DIRS nvshmem.h + HINTS ${NVSHMEM_DIR} $ENV{NVSHMEM_DIR} + PATH_SUFFIXES include + NO_DEFAULT_PATH + DOC "The location of NVSHMEM headers.") +find_path(NVSHMEM_INCLUDE_DIRS nvshmemx.h) + +# Handle the find_package arguments +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + NVSHMEM DEFAULT_MSG NVSHMEM_LIBRARY NVSHMEM_INCLUDE_DIRS) + +# Build the imported target +if (NOT TARGET NVSHMEM::NVSHMEM) + add_library(NVSHMEM::NVSHMEM INTERFACE IMPORTED) + set_property(TARGET NVSHMEM::NVSHMEM PROPERTY + INTERFACE_LINK_LIBRARIES ${NVSHMEM_LIBRARY}) + set_property(TARGET NVSHMEM::NVSHMEM PROPERTY + INTERFACE_INCLUDE_DIRECTORIES ${NVSHMEM_INCLUDE_DIRS}) +endif () + +if (NVSHMEM_FOUND) + # Workaround for separable compilation with cooperative threading. see + # https://stackoverflow.com/questions/53492528/cooperative-groupsthis-grid-causes-any-cuda-api-call-to-return-unknown-erro. + # Adding this to INTERFACE_COMPILE_OPTIONS does not seem to solve the problem. + # It seems that CMake does not add necessary options for device linking when cuda_add_executable/library is NOT used. 
See also + # https://github.com/dealii/dealii/pull/5405 + string(APPEND CMAKE_CUDA_FLAGS " -gencode=arch=compute_70,code=compute_70") +endif () diff --git a/cmake/modules/FindPython.cmake b/cmake/modules/FindPython.cmake index 62c7945174f..39d5430461e 100644 --- a/cmake/modules/FindPython.cmake +++ b/cmake/modules/FindPython.cmake @@ -64,13 +64,19 @@ execute_process( COMMAND "${Python_EXECUTABLE}" "-c" "import sys; from distutils.sysconfig import get_config_var; sys.stdout.write(get_config_var('LIBDIR'))" OUTPUT_VARIABLE _LIB_DIR) -if (BUILD_SHARED_LIBS) - set(_GLOB_EXPR "${_LIB_DIR}/libpython*${CMAKE_SHARED_LIBRARY_SUFFIX}") -ELSE (BUILD_SHARED_LIBS) - set(_GLOB_EXPR "${_LIB_DIR}/libpython*${CMAKE_STATIC_LIBRARY_SUFFIX}") -endif (BUILD_SHARED_LIBS) -FILE(GLOB _GLOB_RESULT "${_GLOB_EXPR}") -get_filename_component(Python_LIBRARIES "${_GLOB_RESULT}" ABSOLUTE) + +set(_PY_MAJ_MIN_VERSION "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") +find_library(Python_LIBRARY + NAMES python python${_PY_MAJ_MIN_VERSION}m python${_PY_MAJ_MIN_VERSION} + python${Python_VERSION_MAJOR}m python${Python_VERSION_MAJOR} + HINTS ${_LIB_DIR} + DOC "The python${Python_VERSION_MAJOR} library." + NO_DEFAULT_PATH) +if (NOT Python_LIBRARY) + message(FATAL_ERROR "Could not find Python library for version " + "${_PY_MAJ_MIN_VERSION} in directory: ${_LIB_DIR}") +endif () +set(Python_LIBRARIES "${Python_LIBRARY}") # Handle the find_package arguments include(FindPackageHandleStandardArgs) diff --git a/docs/RSTDocsFlavorText.py b/docs/RSTDocsFlavorText.py index 2f6b5969f08..bfdc1caf249 100644 --- a/docs/RSTDocsFlavorText.py +++ b/docs/RSTDocsFlavorText.py @@ -9,6 +9,7 @@ 'callbacks' : 'Callback Interface', 'data_readers' : 'Data Readers Interface', 'data_store' : 'Data Store Interface', + 'execution_contexts' : 'Execution Context Interface', 'layers' : 'Layer Interface', 'layers/activations' : 'Activation Layers', 'layers/image' : 'Image Layers', @@ -26,6 +27,9 @@ 'objective_functions/weight_regularization' : 'Objective Functions for Weight Regularization', 'optimizers' : 'Optimizer Interface', 'proto' : 'Protobuf and Front-End Utilities', + 'trainers' : 'Trainer Interface', + 'training_algorithms' : 'Training Algorithm Interface', + 'transforms' : 'Transform Interface', 'utils' : 'General Utilities', 'utils/threads' : 'Multithreading Utilities', 'weights' : 'Weights Interface' @@ -33,10 +37,14 @@ lbann_rst_flavor_text = { '.' : ''' -Welcome to the LBANN developers' documentation. The documentation is -laid out following a similar structure to the source code to aid in -navigation. - ''', +The LBANN API documentation is almost entirely generated by `Doxygen +`_. We encourage developers to view the +`Doxygen-generated documentation +<../_static/doxygen/html/index.html>`_. The API documentation is largely +reproduced here (using `Breathe +`_) for those who prefer the +Sphinx/RTD style. It is laid out following a similar structure to the +source code to aid in navigation.''', 'callbacks' : ''' Callbacks give users information about their model as it is trained. @@ -52,6 +60,21 @@ The data store provides in-memory caching of the data set and inter-epoch data shuffling.''', + 'execution_contexts' : ''' +When a model is attached to a trainer, the execution context of the +training algorithm is stored in an `execution_context` (or sub-class) +object per execution mode. Thus there is one execution context per +model and mode that contains all of the state with respect to the +training algorithm being applied to the model. 
+ +For example it tracks the current: + +* step +* execution mode +* epoch +* and a pointer back to the trainer. +''', + 'layers' : ''' LBANN models are defined in model prototext files. The bulk of these defintions will be the series of layers which make up the model @@ -103,6 +126,31 @@ python front end of LBANN will emit a network description in the protobuf format that is ingested at runtime.''', + 'trainers' : ''' +A trainer is a collection of compute resources and defines an explicit +communication domain. It manages the execution for both the training +and inference of a trained model. Once constructed, a trainer owns an +`lbann_comm` object that defines both intra- and inter-trainer +communication domains. Additionally, a trainer will contain an I/O +thread pool that is used to fetch and preprocess data that will be +provided to the trainer's models. + +A trainer owns: + +* `lbann_comm` object, +* I/O thread pool, +* One or more models, and +* Execution context for each model. + +In the future, it will also contain the data readers. +''', + + 'training_algorithms' : ''' +The training algorithm defines the optimization that is to be +applied to the model(s) being trained. Additionally, it can +specify how to evaluate the model. +''', + 'utils' : 'Utility classes and functions.', 'utils/threads' : 'TODO: Something about utils/threads', diff --git a/docs/SourceTreeDoxyfile b/docs/SourceTreeDoxyfile index eb38cd65aa7..8fcbae615bd 100644 --- a/docs/SourceTreeDoxyfile +++ b/docs/SourceTreeDoxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = doxy_out +OUTPUT_DIRECTORY = _static/doxygen # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -763,7 +763,8 @@ WARN_LOGFILE = INPUT = ../README.md \ ../docs \ ../src \ - ../include + ../include \ + ../unit_test # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -1359,7 +1360,7 @@ ECLIPSE_DOC_ID = org.doxygen.Project # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -DISABLE_INDEX = YES +DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag @@ -1564,7 +1565,7 @@ EXTRA_SEARCH_MAPPINGS = # If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output. # The default value is: YES. -GENERATE_LATEX = YES +GENERATE_LATEX = NO # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -1616,7 +1617,7 @@ PAPER_TYPE = a4wide # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = amsmath +EXTRA_PACKAGES = amsmath, amssymb, amsfonts, latexsym # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1841,7 +1842,7 @@ GENERATE_XML = YES # The default directory is: xml. # This tag requires that the tag GENERATE_XML is set to YES. 
-XML_OUTPUT = xml +XML_OUTPUT = ../../doxy_out/xml # If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program # listings (including syntax highlighting and cross-referencing information) to diff --git a/docs/build_osx.rst b/docs/build_osx.rst index 3e5ce179a09..753bfeef4b1 100644 --- a/docs/build_osx.rst +++ b/docs/build_osx.rst @@ -5,25 +5,16 @@ Building LBANN on OS X ========================= -.. warning:: This section is still under development and being - tested. It contains known issues. This warning will be - removed when it is believed to be generally usable. +.. warning:: If using OSX 10.14 or newer, be sure that + :bash:`/usr/include` has been restored. In version 10.14, + this may be accomplished by installing + :bash:`/Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg`. + If this package is not available, it's possible command + line tools have not been installed; do so by executing + :bash:`xcode-select --install`. --------------------- -Getting Started --------------------- - -.. _osx-setup-spack: - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Setup Spack and local base tools -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To get started follow the general directions on building LBANN to -`setup spack -`_. - +.. _osx-basic-setup: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Setup Homebrew @@ -31,104 +22,41 @@ Setup Homebrew .. note:: Setting up Homebrew only needs to be done once per system,. -1. Download and install `Homebrew `_. Setup base - development packages. Note that at the moment we use brew to - install llvm, open-mpi, scalapack, and cmake. - - .. code-block:: bash +Download and install `Homebrew `_. Setup base +development packages. Note that at the moment we use brew to install +LLVM, Open-MPI, ScaLAPACK, and CMake. - brew install llvm - brew install open-mpi - brew install scalapack - brew install cmake +.. code-block:: bash - Put the brew based clang in your path: + brew install llvm + brew install open-mpi + brew install cmake + brew install hwloc - .. code-block:: bash +Put the brew-based :code:`clang` in your path: - export PATH="/usr/local/opt/llvm/bin:$PATH"; +.. code-block:: bash - Install lmmod so that we can use modules to put spack built - packages into your path. + export PATH=/usr/local/opt/llvm/bin:$PATH; - .. code-block:: bash +Install :code:`lmod` so that we can use modules to put Spack-built +packages into your path: - brew install lmod - brew install luarocks +.. code-block:: bash - Update your .profile to enable use of modules via lmod + brew install lmod + brew install luarocks - .. code-block:: bash +Update your shell configuration files to enable use of modules via +:code:`lmod`: - source $(brew --prefix lmod)/init/$(basename $SHELL) +.. code-block:: bash -.. _osx-build-install-as-developer: + source $(brew --prefix lmod)/init/$(basename $SHELL) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Building & Installing LBANN as a developer +Building & Installing LBANN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -1. Establish a Spack environment and install software dependencies. - - .. note:: This spack environment has to be setup once each time - you create a new build directory. - - .. code-block:: bash - - export LBANN_HOME=/path/to/lbann/git/repo - export LBANN_BUILD_DIR=/path/to/a/build/directory - export LBANN_INSTALL_DIR=/path/to/an/install/directory - cd ${LBANN_BUILD_DIR} - spack env create -d . 
${LBANN_HOME}/spack_environments/developer_release_osx_spack.yaml - spack install - spack env loads # Spack creates a file named loads that has all of the correct modules - source loads - unset LIBRARY_PATH - - -2. Build LBANN locally from source and build Hydrogen and Aluminum - using the superbuild. See :ref:`here ` - for a list and descriptions of all CMake flags known to LBANN's - "Superbuild" build system. A representative CMake command line - that expects :bash:`LBANN_HOME`, :bash:`LBANN_BUILD_DIR`, - :bash:`LBANN_INSTALL_DIR` environment variables might be: - - .. code-block:: console - - cd ${LBANN_BUILD_DIR} - cmake \ - -G Ninja \ - -D CMAKE_BUILD_TYPE:STRING=Release \ - -D CMAKE_INSTALL_PREFIX:PATH=${LBANN_INSTALL_DIR} \ - \ - -D LBANN_SB_BUILD_ALUMINUM=ON \ - -D ALUMINUM_ENABLE_MPI_CUDA=OFF \ - -D ALUMINUM_ENABLE_NCCL=OFF \ - \ - -D LBANN_SB_BUILD_HYDROGEN=ON \ - -D Hydrogen_ENABLE_ALUMINUM=ON \ - -D Hydrogen_ENABLE_CUB=OFF \ - -D Hydrogen_ENABLE_CUDA=OFF \ - \ - -D LBANN_SB_BUILD_LBANN=ON \ - -D LBANN_DATATYPE:STRING=float \ - -D LBANN_SEQUENTIAL_INITIALIZATION:BOOL=OFF \ - -D LBANN_WITH_ALUMINUM:BOOL=ON \ - -D LBANN_WITH_CONDUIT:BOOL=ON \ - -D LBANN_WITH_CUDA:BOOL=OFF \ - -D LBANN_WITH_CUDNN:BOOL=OFF \ - -D LBANN_WITH_NCCL:BOOL=OFF \ - -D LBANN_WITH_NVPROF:BOOL=OFF \ - -D LBANN_WITH_SOFTMAX_CUDA:BOOL=OFF \ - -D LBANN_WITH_TOPO_AWARE:BOOL=ON \ - -D LBANN_WITH_TBINF=OFF \ - -D LBANN_WITH_VTUNE:BOOL=OFF \ - \ - -D CMAKE_CXX_COMPILER=$(which clang) \ - -D CMAKE_C_COMPILER=$(which clang) \ - -D LBANN_SB_FWD_ALUMINUM_OpenMP_CXX_LIB_NAMES=omp \ - -D LBANN_SB_FWD_ALUMINUM_OpenMP_CXX_FLAGS=-fopenmp \ - -D LBANN_SB_FWD_ALUMINUM_OpenMP_omp_LIBRARY=/usr/local/opt/llvm/lib/libomp.dylib \ - ${LBANN_HOME}/superbuild - - ninja +From this point, follow the instructions for :ref:`building LBANN with +Spack `. diff --git a/docs/build_with_cmake.rst b/docs/build_with_cmake.rst index f6f49ebd305..d694e8dd11c 100644 --- a/docs/build_with_cmake.rst +++ b/docs/build_with_cmake.rst @@ -8,7 +8,7 @@ Building LBANN with `CMake `_ ================================================== LBANN uses `CMake `_ for its build system and a -version newer than or equal to 3.9.0 is required. LBANN development is +version newer than or equal to 3.12.0 is required. LBANN development is done primarily on UNIX-based platforms. As such, the build is tested regularly on Linux-based machines, occasionally on OSX, and never on Windows machines. @@ -22,6 +22,87 @@ is missing, please `open an issue `_. It is required that LBANN be built out-of-source. That is, CMake must not be invoked in a directory containing a CMakeLists. +-------------------- +Dependencies +-------------------- + +The following packages and tools are required to build LBANN. All +packages listed below may be installed using `Spack +`_. See :ref:`the Spack installation +instructions ` for more details on using Spack to +build a complete LBANN environment. + +The following basic tools are **required**. + ++ A C++11-compliant compiler. + ++ OpenMP, version 3.0 or newer. + ++ An MPI-3.0 implementation. + ++ `CEREAL `_ is used to handle + complex serialization tasks. + ++ `CMake `_, version 3.12 or newer. + +The following LLNL-maintained packages are **required**. + ++ `Hydrogen `_ is a fork of the + `Elemental `_ distributed + dense linear-algebra library and it may be installed via + `Spack `_ using the package name + "hydrogen". If CUDA support is enabled in Hydrogen, LBANN will + inherit this support. + +The following third-party packages are **required**. 
+ ++ `CNPY `_ is used to ingest data + in NumPy format. In principle this should be optional, but at time + of writing, LBANN will not build without it. + ++ `OpenCV `_ is used to preprocess + image data. For performance reasons, it is recommend to build OpenCV + with `JPEG-turbo `_ + for JPEG format support. + ++ `ProtoBuf `_ is used to + express models in a portable format. + +The following LLNL-maintained packages are **optional**. + ++ `Aluminum `_ is a + communication library optimized for machine learning and interaction + with GPUs. We cannot recommend its use strongly enough. It can be + built using `Spack `_. + ++ `CONDUIT `_ is used to ingest + structured data produced by scientific simulations. + ++ `DiHydrogen `_ is going to + become the linear algebra interface; currently, it can be used to + manage metaprogramming and some utilities. + +The following third-party packages are **optional**. + ++ `CUDA `_. The development + team currently uses CUDA version 9.2. Building with CUDA support + requires that Hydrogen has been built with CUDA support (see below). + ++ `cuDNN `_ is required if + building LBANN with CUDA support. It is freely available as a binary + distribution from NVIDIA. + ++ `HWLOC `_. HWLOC enables + LBANN to make certain optimizations based on the hardware + topology. Its use is strongly recommended. + ++ NVTX. LBANN supports some improved annotations for NVPROF using + NVTX. NVTX is provided as part of the CUDA toolkit. + ++ VTune. LBANN supports some improved annotations for VTune. + + + -------------------- LBANN CMake options -------------------- @@ -37,6 +118,10 @@ The following options are exposed in the CMake build system. + :code:`LBANN_WITH_CONDUIT` (Default: :code:`OFF`): Build with support for CONDUIT. ++ :code:`LBANN_WITH_DIHYDROGEN` (Default: :code:`OFF`): Build with + DiHydrogen support. This will replace temporary implementations in + LBANN with permanent implementations from DiHydrogen. + + :code:`LBANN_WITH_NVPROF` (Default: :code:`OFF`): Build with extra annotations for NVPROF. + :code:`LBANN_WITH_TOPO_AWARE` (Default: :code:`ON`): Use HWLOC for topology-aware choices. @@ -106,6 +191,12 @@ The latter option is recommended. file. Must set :code:`LBANN_WITH_CONDUIT=ON` to enable CONDUIT support. ++ :code:`DIHYDROGEN_DIR` or :code:`H2_DIR`: The + path to *either* the DiHydrogen installation prefix *or* the + :code:`DiHydrogenConfig.cmake` file. Alternatively, + :code:`DiHydrogen_DIR` can be set to the path of the + :code:`DiHydrogenConfig.cmake` file. + + :code:`HDF5_DIR`: The path to *either* the HDF5 installation prefix *or* the :code:`hdf5_config.cmake` file. There is a known issue with CONDUIT that it may link to HDF5 but not properly export that @@ -147,6 +238,28 @@ documentation of the packages that are causing the issues as they may require additional CMake/environment flags to be set before properly resolving. +------------------------------ +Building JAG utilities +------------------------------ +The JAG utility executables are not part of the `all` target. In order +to use or install them, they must be built using the `jag-utils` +target. In order to install them, this must be done before installing. + +.. code-block:: bash + + # Configure LBANN + cmake /path/to/lbann + + # Build main LBANN library and front-ends + cmake --build . + + # If JAG utilities are required, build them + cmake --build . --target jag-utils + + # Install all (built) targets + cmake --build . 
--target install + + ------------------------------ Example CMake invocation ------------------------------ diff --git a/docs/building_lbann.rst b/docs/building_lbann.rst index b374170bb07..c3053300fd4 100644 --- a/docs/building_lbann.rst +++ b/docs/building_lbann.rst @@ -10,83 +10,7 @@ Download -------------------- LBANN source code can be obtained from the `Github -repo `_. - --------------------- -Dependencies --------------------- - -The following packages and tools are required to build LBANN. All -packages listed below may be installed using `Spack -`_. See :ref:`below -` for more details on using Spack to build a -complete LBANN environment. - -The following basic tools are **required**. - -+ A C++11-compliant compiler. - -+ OpenMP, version 3.0 or newer. - -+ An MPI-3.0 implementation. - -+ `CEREAL `_ is used to handle - complex serialization tasks. - -+ `CMake `_, version 3.9 or newer. - -The following LLNL-maintained packages are **required**. - -+ `Hydrogen `_ is a fork of the - `Elemental `_ distributed - dense linear-algebra library and it may be installed via - `Spack `_ using the package name - "hydrogen". If CUDA support is enabled in Hydrogen, LBANN will - inherit this support. - -The following third-party packages are **required**. - -+ `CNPY `_ is used to ingest data - in NumPy format. In principle this should be optional, but at time - of writing, LBANN will not build without it. - -+ `OpenCV `_ is used to preprocess - image data. For performance reasons, it is recommend to build OpenCV - with `JPEG-turbo `_ - for JPEG format support. - -+ `ProtoBuf `_ is used to - express models in a portable format. - -The following LLNL-maintained packages are **optional**. - -+ `Aluminum `_ is a - communication library optimized for machine learning and interaction - with GPUs. We cannot recommend its use strongly enough. It can be - built using `Spack `_. - -+ `CONDUIT `_ is used to ingest - structured data produced by scientific simulations. - -The following third-party packages are **optional**. - -+ `CUDA `_. The development - team currently uses CUDA version 9.2. Building with CUDA support - requires that Hydrogen has been built with CUDA support (see below). - -+ `cuDNN `_ is required if - building LBANN with CUDA support. It is freely available as a binary - distribution from NVIDIA. - -+ `HWLOC `_. HWLOC enables - LBANN to make certain optimizations based on the hardware - topology. Its use is strongly recommended. - -+ NVTX. LBANN supports some improved annotations for NVPROF using - NVTX. NVTX is provided as part of the CUDA toolkit. - -+ VTune. LBANN supports some improved annotations for VTune. - +repository `_. .. _building-with-spack: @@ -94,8 +18,14 @@ The following third-party packages are **optional**. Building with `Spack `_ ------------------------------------------------------------ +.. note:: Users attempting to install LBANN on a Mac OSX machine may + need to do :ref:`additional setup ` before + continuing. In particular, installing LBANN requires a + different compiler than the default OSX command line tools + and an MPI library. + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Setup Spack and local base tools +Setup Spack ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1. Download and install `Spack `_. @@ -104,201 +34,162 @@ Setup Spack and local base tools .. code-block:: bash - . ${SPACK_ROOT}/share/spack/setup-env.sh - - -2. Setup your compiler and external software environment. For example, - on LLNL\'s LC machines, one might load the following modules: - - .. 
code-block:: bash - - ml gcc/7.3.0 mvapich2/2.3 cuda/10.0.130 # Pascal - - or + source ${SPACK_ROOT}/share/spack/setup-env.sh - .. code-block:: bash - - ml gcc/7.3.1 cuda/9.2.148 spectrum-mpi/rolling-release # Lassen / Sierra +2. LBANN will use `Spack environments + `_ to + specify and manage both compilers and versions of dependent + libraries. Go to the install instructions for :ref:`users + ` or :ref:`developers + `. - + Note to unload unwanted modules you can execute :bash:`ml` with - package names prepended with a dash, e.g.: :bash:`ml -intel`. To - unload all currently loaded modules, use :bash:`ml purge`. +.. note:: Optionally, setup your Spack environment to take advantage + of locally installed tools. Unless your Spack environment + is explicitly told about tools such as CMake, Python, MPI, + etc., it will install everything that LBANN and all of its + dependencies require. This can take quite a long time but + only has to be done once for a given spack repository. Once + all of the standard tools are installed, rebuilding LBANN + with Spack is quite fast. -3. Optionally, setup your spack environment to take advantages of - locally installed tools. Note that unless your spack environment - is explicitly told about tools such as cmake, python, mpi, etc. it - will install everything that LBANN and all of its dependencies - require. This can take quite a long time, but only has to be done - once for a given spack repository. Once all of the standard tools - are installed, rebuilding LBANN with spack is quite fast. + Advice on setting up paths to external installations is + beyond the scope of this document but is covered in the + `Spack Documentation + `_. - + Advice on setting up paths to external installations is beyond - the scope of this document, but is covered in the `Spack - Documentation `_. +.. _install_lbann_as_user: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Building & Installing LBANN as a user ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. warning:: This section is still under development and being - tested. It contains known issues. This warning will be - removed when it is believed to be generally usable. - With Spack setup and installed into your path, it can be used to install the LBANN executables. This approach is appropriate for users -that want to train new or existing models using the python front-end. +that want to train new or existing models using the Python front-end. .. note:: If your model requires custom layers or data readers, you may need to install LBANN as a developer, which would allow you to modify and recompile the source code. -Here are three easy ways to install LBANN: +Users comfortable with Spack and `its idioms for installing packages +`_ +or those who already have `customizations to their Spack ecosystem +`_ in place +may simply use -- Using the Spack environment method, (e.g., for an x86_64 LLNL LC - system with GPU support): +.. code-block:: bash - .. note:: This method provides a consistent set of dependencies during - installation. + spack install lbann - .. code-block:: bash +In this case, it is not even necessary to clone the LBANN repository +from Github; Spack will handle this in its installation. 
- cd /spack_environments/users/llnl_lc/_gpu/ # where = x86_64 | ppc64le - spack install - ml load lbann +For users that are new to spack, LBANN provides a script that will do +some basic configuration and then install LBANN using the Spack +environment method: -- Building with the latest released versions and GPU support (use the - user's defaults for specifying the compiler, MPI library, etc.): +.. code-block:: bash - .. code-block:: bash + /scripts/install_lbann.sh -e lbann + spack env activate -p lbann - spack install lbann +gpu +nccl - ml load lbann +Options exist in the script to disable the GPUs and change the +name of the Spack environment. These can be viewed by passing the +:code:`-h` option to the script. -- Building with the head of develop branch for lbann, hydrogen and - aluminum with GPU support (use the user's defaults for specifying - the compiler, MPI library, etc.): +.. note:: Currently this script will clone a second LBANN repository + that Spack will use to build the LBANN library and + executables. We are working on simplifying this further. - .. code-block:: bash - spack install lbann@develop +gpu +nccl ^hydrogen@develop ^aluminum@master - ml load lbann - -There are numerous options for all of these packages. These options -can be viewed via commands such as :bash:`spack info lbann`. To -specify the compiler, one can add options such as :code:`%gcc@7.3.0`. -For further information about specifying dependencies, such as the MPI -library, please consult `the Spack documentation -`_. +.. _build_lbann_from_source: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Building & Installing LBANN as a developer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Developers of LBANN will often need to interact with the source code -and/or advanced configuration options for Aluminum, Hydrogen, and -LBANN while the other dependencies remain constant. The Spack -installation instructions below set up a Spack environment with the -remaining dependencies, requiring the developer to build Aluminum, -Hydrogen, and LBANN separately, by whatever means they choose. +and/or set advanced configuration options for Aluminum, Hydrogen, and +LBANN while the other dependencies remain constant. The installation +instructions below provide a script that will setup a Spack +environment with the remaining dependencies, and then invoke the LBANN +CMake infrastructure to build LBANN from the local source. The +provided script will build with a standard compiler for a given +platform and the nominal options in the CMake build environment. +Expert developers should refer to :ref:`the "Superbuild" documentation +` for a list and descriptions of all +CMake flags known to LBANN's "Superbuild" build system. + +1. Install all of the external packages via Spack (Aluminum, + Hydrogen, etc). + + Install packages into a Spack environment. This is only done when + initially installing or upgrading the dependencies. LBANN provides + a script to install the basic dependencies in their default + configurations and it can be found at: -1. Establish a Spack environment and install software dependencies. - Note that there are four environments to pick from along two axes: + .. code-block:: bash - .. note:: This spack environment has to be setup once each time - you create a new build directory. + /scripts/install_lbann.sh -d - 1. developers or users - 2. x86_64 and ppc64le + Note that the named environment can be controlled via the + :code:`-e` flag. A full list of options can be viewed with the + :code:`-h` flag. 
- For example if you are a developer and want to build the inside of - the git repo use the following instructions: +2. Setup the LBANN CMake environment using the Spack environment for + the dependencies. .. code-block:: bash - export LBANN_HOME=/path/to/lbann/git/repo - export LBANN_BUILD_DIR=/path/to/a/build/directory - export LBANN_INSTALL_DIR=/path/to/an/install/directory - cd ${LBANN_BUILD_DIR} - spack env create -d . ${LBANN_HOME}/spack_environments/developer_release__cuda_spack.yaml # where = x86_64 | ppc64le - cp ${LBANN_HOME}/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml . - cp ${LBANN_HOME}/spack_environments/externals__llnl_lc_cz.yaml . # where = x86_64 | ppc64le - spack install - spack env loads # Spack creates a file named loads that has all of the correct modules - source loads - unset LIBRARY_PATH - - - + Note that the environments provided here have a set of external - packages and compilers that are installed on an LLNL LC CZ - system. Please update these for your system environment. - Alternatively, you can create baseline versions of the - user-level Spack configuration files and remove the externals - and compilers from the :code:`spack.yaml` file. More details are - provided :ref:`here `. - - + Note that the initial build of all of the standard packages in Spack - will take a while. - - + Note that the Spack module files set the :bash:`LIBRARY_PATH` environment - variable. This behavior allows autotools-based builds to pickup the - correct libraries but interferes with the way that CMake sets up - RPATHs. To correctly establish the RPATH, please unset the variable - as noted above, or you can explicitly pass the RPATH fields to CMake - using a command such as: - - .. code-block:: bash - - cmake -DCMAKE_INSTALL_RPATH=$(sed 's/:/;/g' <<< "${LIBRARY_PATH}") \ - -DCMAKE_BUILD_RPATH=$(sed 's/:/;/g' <<< "${LIBRARY_PATH}") \ - ... - -2. Build LBANN locally from source and build Hydrogen and Aluminum - using the superbuild. See :ref:`here ` - for a list and descriptions of all CMake flags known to LBANN's - "Superbuild" build system. A representative CMake command line - that expects :bash:`LBANN_HOME`, :bash:`LBANN_BUILD_DIR`, - :bash:`LBANN_INSTALL_DIR` environment variables might be: + /scripts/build_lbann_from_source.sh + + + Options exist in the script to disable the GPUs, set a build and + install prefix, separately set the build and install + directories, or use a different spack environment. These options + can be viewed using the :code:`-h` flag. + + The environments provided by this script have a set of external + packages and compilers that are installed on an LLNL LC CZ, NERSC, + or LLNL-configured OS X system. If you are not on one of these + systems, please update the externals and compilers for your system + environment. Alternatively, you can create baseline versions of + the user-level Spack configuration files and remove the externals + and compilers from the :code:`spack.yaml` file. More details are + provided :ref:`here `. + + .. warning:: Depending on the completeness of the externals + specification, the initial build of all of the + standard packages in Spack can take a long time. + +3. Once the installation has completed, you can load the module file + for LBANN with the following command .. 
code-block:: console - cd ${LBANN_BUILD_DIR} - cmake \ - -G Ninja \ - -D CMAKE_BUILD_TYPE:STRING=Release \ - -D CMAKE_INSTALL_PREFIX:PATH=${LBANN_INSTALL_DIR} \ - \ - -D LBANN_SB_BUILD_ALUMINUM=ON \ - -D ALUMINUM_ENABLE_MPI_CUDA=OFF \ - -D ALUMINUM_ENABLE_NCCL=ON \ - \ - -D LBANN_SB_BUILD_HYDROGEN=ON \ - -D Hydrogen_ENABLE_ALUMINUM=ON \ - -D Hydrogen_ENABLE_CUB=ON \ - -D Hydrogen_ENABLE_CUDA=ON \ - \ - -D LBANN_SB_BUILD_LBANN=ON \ - -D LBANN_DATATYPE:STRING=float \ - -D LBANN_SEQUENTIAL_INITIALIZATION:BOOL=OFF \ - -D LBANN_WITH_ALUMINUM:BOOL=ON \ - -D LBANN_WITH_CONDUIT:BOOL=ON \ - -D LBANN_WITH_CUDA:BOOL=ON \ - -D LBANN_WITH_CUDNN:BOOL=ON \ - -D LBANN_WITH_NCCL:BOOL=ON \ - -D LBANN_WITH_NVPROF:BOOL=ON \ - -D LBANN_WITH_SOFTMAX_CUDA:BOOL=ON \ - -D LBANN_WITH_TOPO_AWARE:BOOL=ON \ - -D LBANN_WITH_TBINF=OFF \ - -D LBANN_WITH_VTUNE:BOOL=OFF \ - ${LBANN_HOME}/superbuild - - ninja - ml use ${LBANN_INSTALL_DIR}/etc/modulefiles/ + ml use /etc/modulefiles ml load lbann-0.99.0 -The complete documentation for building LBANN directly with CMake can -be found :ref:`here `. + For advanced users, :ref:`the LBANN superbuild system + ` provides additional control over + the dependencies, especially Aluminum and Hydrogen. + +4. After the initial setup of the LBANN CMake environment, you can + rebuild by activating the Spack environment and then re-running + ninja. + + .. code-block:: console + + spack env activate -p + cd /lbann/build + unset CPATH # Can cause bad include resolution + ninja + +For more control over the LBANN build, please see :ref:`the complete +documentation for building LBANN directly with CMake +`. ------------------------------ Advanced build methods diff --git a/docs/conf.py b/docs/conf.py index d1763486df5..a921a29a037 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,9 +18,13 @@ import subprocess, os, runpy -rebuild_doxygen = not os.path.isdir("doxy_out/xml") +rebuild_doxygen = not os.path.isdir("doxy_out/xml") or not os.path.isdir("_static/doxygen/html") +if not os.path.isdir("_static"): + os.makedirs("_static") + if rebuild_doxygen: + os.makedirs("doxy_out/xml") subprocess.call('doxygen SourceTreeDoxyfile', shell=True) #exec(open("./BuildRSTDocs.py").read()) @@ -75,6 +79,7 @@ # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' +html_static_path = ['_static'] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/continuous_integration.rst b/docs/continuous_integration.rst new file mode 100644 index 00000000000..ddced95ebff --- /dev/null +++ b/docs/continuous_integration.rst @@ -0,0 +1,221 @@ +.. role:: bash(code) + :language: bash + +.. role:: python(code) + :language: python + +LBANN CI +==================== + +Bamboo is the continuous integration (CI) framework we use. +A Bamboo plan consists of stages (which run sequentially), +which consist of jobs (which run in parallel), +which consist of tasks (which run sequentially). + +The LBANN build project has many plans. +Two plans run off of `LLNL/lbann/develop `_ +- Nightly Develop and Weekly Develop. +Nightly Develop runs every night (except Saturday) at midnight. +Weekly Develop runs every Saturday at midnight. +The other plans in the build project are for each individual LBANN developer's +fork of LBANN. + +All plans run off the latest *pushed* commits to the repository. +That means if you have local commits that you have not pushed to your fork, +these commits will *not* be tested by Bamboo. 
+If you have pushed commits to your fork but have not merged your branch into +the main repository's "develop", +your commits will be tested on your individual plan, +but not on Nightly Develop or Weekly Develop. + +Plan Configuration +---------------------------------------- +Each plan is identical (except Weekly Develop, which will be explained below). +The plans consist of a single stage "Tests". +The stage consists of two jobs - "x86_cpu" (Catalyst), and "x86_gpu" (Pascal). +Each of these jobs can run in parallel. +They consist of an identical list of tasks: + +1. Checkout Default Repository (checkout the repository) + +2. Run :bash:`./allocate_and_run.sh`; + Weekly Develop adds the :bash:`--weekly` option. + This script allocates nodes and then runs "run.sh" which does the following: + + a. Remove Generated Files (each build creates a large number of files. + We may look at these files between builds, + so we cannot delete them at the end of a build. + So, instead we delete them before doing any real work in the next build. + This also ensures the generated files came from the latest build and not + a previous build). + + b. Compiler Tests (run tests in "bamboo/compiler_tests") + + c. Integration Tests (run tests in "bamboo/integration_tests") + + d. Unit Tests (run tests in "bamboo/unit_tests") + +3. JUnit Parser (this allows Bamboo to render test results in a nice UI) + + +The tests in Task 2 run +:bash:`$PYTHON -m pytest -s -vv --durations=0 [--weekly] --junitxml=results.xml`, +which will run all the pytests in the job's associated directory. +Note that :bash:`$PYTHON` refers to the Python build to use. +Also note that only Weekly Develop adds the :bash:`--weekly` option. +Many (mostly longer-running) tests are set to not run unless this option is on. +Weekly Develop runs a superset of the tests that Nightly Develop runs. + +Directory Structure +---------------------------------------- + +"bamboo/compiler_tests", "bamboo/integration_tests", "bamboo/unit_tests" each +have a "conftest.py" that pytest requires. +They also contain one or more python files. +Each of these files have a number of tests to run. + +Writing Your Own Tests +---------------------------------------- + +A side effect of our Bamboo setup is that tests must be written using pytest. +Test files must begin with :bash:`test_` to be recognized by pytest. +Individual test methods must also begin with :python:`test_`. +Test methods should use the :python:`assert` keyword or raise an +:python:`AssertionError`. +A test will only fail if the assertion turns out to be false. +Not putting an assertion will automatically cause the test to pass. + +How then to test non-Python code? +You can just wrap your test with Python. +A test can be as simple as asserting the output code of a shell command is 0. +The output code of a command can be found using Python's :python:`os.system()`. + +Running Tests On Your Individual Plan +---------------------------------------- + +Unlike Nightly Develop, the individual plans are triggered to run by polling +your fork for commits. +They do not run nightly. +If you push new commits to your fork, a new build should start automatically. +You can also manually start a build by navigating to your individual plan and +clicking Run > Run plan +(this will say "Run branch" if you have plan branches set up). +Once again, keep in mind that the tests will run off what has been pushed to +your GitHub fork of LBANN and not your local copy of the LBANN repository. 
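As a concrete sketch of the conventions described under "Writing Your Own Tests" above: the file name, test names, and echoed command below are hypothetical and are not part of the LBANN test suite; they only illustrate the :bash:`test_` naming rule, the use of :python:`assert`, and wrapping a shell command with :python:`os.system()`.

.. code-block:: python

   # test_example.py -- minimal, hypothetical pytest file.
   import os


   def test_trivial_assertion():
       # A test passes unless an assertion fails (or an exception is raised).
       assert 1 + 1 == 2


   def test_wrapped_shell_command():
       # Non-Python code can be tested by wrapping it in Python and
       # asserting that the shell command's exit status is 0.
       return_code = os.system('echo "stand-in for invoking an LBANN executable"')
       assert return_code == 0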
+ +Plan branches allow you to test multiple branches simultaneously instead +of simply testing "/develop". +You can create plan branches by navigating to your individual plan, +clicking Actions > Configure plan > Branches > Create plan branch. + +Navigating Bamboo +---------------------------------------- + +From the `LBANN Project Summary `_, +click on a plan. +From there, click on a build (builds are listed under "Recent History" and can +also be accessed from the pass/fail marks in the top right, +to the left of the "Run" button). +This will bring you to a certain build's page. +The most relevant tabs are "Tests" and "Logs". +It is recommended to look at failures first in the "Tests" tab, +as the build logs can be difficult to parse through. +The build's "Tests" tab shows "New test failures", "Existing test failures", +"Fixed tests", and "Skipped Tests". + +From the build's page, you can also click on individual jobs, +which have the same tabs. +The "Tests" tabs of the individual jobs have two sub-tabs, +"Failed tests" and "Successful tests". +They do not display skipped tests. +The Bamboo agent that ran the job can be found by looking at the "Agent" field +under the "Job Summary" tab. +Alternatively, you can determine the agent from one of the first lines in the +build logs: +"Build working directory is /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir/". + + +Bamboo Agent Properties +---------------------------------------- + +Bamboo agent properties are used to specify requirements for each job. + ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Agents (jobs) | agent_owner | architecture | cluster | gpu_architecture | sys_type | ++================================+=============+==============+==========+==================+========================+ +| Catalyst Agents (x86_cpu) | lbannusr | x86_64 | catalyst | none | toss_3_x86_64_ib | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Corona Agents (x86_cpu_corona) | lbannusr | x86_64 | corona | none | toss_3_x86_64_ib | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Lassen Agents (ppc64le_gpu) | lbannusr | ppc64le | lassen | volta | blueos_3_ppc64le_ib_p9 | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Pascal Agents (x86_gpu_pascal) | lbannusr | x86_64 | pascal | pascal | chaos_6_x86_64_ib | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Ray Agents (ppc64le_gpu) | lbannusr | ppc64le | ray | pascal | blueos_3_ppc64le_ib | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ + +Currently, "agent_owner", "architecture", and "gpu_architecture" are used to +determine agents to run a job. + +Running Tests From The Command Line +---------------------------------------- + +Navigate to "bamboo/compiler_tests", "bamboo/integration_tests", +or "bamboo/unit_tests". + +To run all the tests in a subdirectory: :bash:`python -m pytest -s --weekly`. +Note that running all tests can take a substantial amount of time. + +To run the tests that Nightly Develop or the individual plans run in a +subdirectory: :bash:`python -m pytest -s`. + +To run a specific test file: :bash:`python -m pytest -s .py`. 
+ +To run a specific test: +:bash:`python -m pytest -s .py -k ''`. + +Most integration and unit tests allow for running a test with a different +executable. +The convention is to have a similarly structured test replacing +:python:`_` with :python:`_exe`. +These tests are set to be skipped in Bamboo, but can be run locally. +There should be a line above the test that gives the command to run the test +locally, likely in the following form: +:bash:`python -m pytest -s .py -k '' --exe=`. + +If you have an executable, you can run the :python:`_exe` tests with +:bash:`local_test.sh`. Use :bash:`local_test.cmd` as a template for writing +a batch script. You can run only integration tests, only unit tests, or both. + +Helpful Files +---------------------------------------- + +First, run :bash:`sudo lbannusr`. + +To look at output and error from previous builds: +:bash:`cd /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir//bamboo//`. +If the test uses the Python Front-End, use: +:bash:`cd /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir//bamboo//experiments/`. +(Note that these files can also be read by clicking on the "Artifacts" tab on +the Bamboo build). + +To look at archived results from previous builds: +:bash:`cd /usr/workspace/wsb/lbannusr/archives/` + +To look at Bamboo agent properties: +:bash:`cat /usr/global/tools/bamboo/agents/lbannusr//bin/bamboo-capabilities.properties` + +You can copy these files over to your own machine as follows: + +- :bash:`sudo lbannusr` + +- :bash:`give ` + +- :bash:`exit` - to go back to your own LC account, not lbannusr's. + +- :bash:`take lbannusr` - now the file exists on your LC account, + but not yet on your own machine. + +From your own machine, not a ssh terminal: + +- :bash:`scp @.llnl.gov: .` diff --git a/docs/documentation_building.rst b/docs/documentation_building.rst new file mode 100644 index 00000000000..7800f699d44 --- /dev/null +++ b/docs/documentation_building.rst @@ -0,0 +1,56 @@ +.. role:: bash(code) + :language: bash + +LBANN Documentation Building +============================ + +.. warning:: Some of the directions in this section are Mac-specific. + +Adding Documentation Outside Code +---------------------------------- + +1. Create a file such as "new_docs.rst" in "lbann/docs". + +2. Add "new_docs" (no ".rst") to the appropriate documentation block in + "lbann/docs/index.rst". + +3. Look at the other ".rst" files in "lbann/docs" to see how to get + certain formatting. + +4. When you want to see how your code looks, you have a couple options: + + a. Push your docs to your fork/branch on GitHub and look at how + the text renders. This is a very simplified look compared to + Read-the-Docs. + + b. From "lbann/docs" run :bash:`make html` and then + :bash:`open -a _build/html/index.html`. + This is exactly how the docs will look. + +5. Merge your code into "lbann/develop" and then have someone with + correct permissions on Read-the-Docs update the + `official docs `_. + +Making The Build Work +---------------------------------- + +In order to make :bash:`make html` work, you may need to do a few steps: + +1. Run :bash:`pip3 install sphinx breathe sphinx-rtd-theme`. + +2. Download Doxygen by going to the + `Doxygen downloads page `_, + downloading "Doxygen-1.8.15.dmg", and + dragging the app to the "Applications" folder. + +3. Determine the directory Doxygen is in by running `which Doxygen`. 
+ If nothing is returned, see if `doxygen` is in + "/Applications/Doxygen.app/Contents/Resources" or + "/Applications/Doxygen.app/Contents/MacOS". + +4. Add Doxygen to your path with + :bash:`PATH=":${PATH}"`. + You may want to add this to your "~/.bash_profile" so your :bash:`PATH` is + always correct. Run :bash:`source ~/.bash_profile` to apply the change. + +5. Try running :bash:`make html` again. diff --git a/docs/index.rst b/docs/index.rst index b07bfae9bb9..e4603712d64 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,7 +11,7 @@ LBANN provides model-parallel acceleration through domain decomposition to optimize for strong scaling of network training. It also allows for composition of model-parallelism with both data parallelism and ensemble training methods for training large neural -networks with massive amounts of data. LBANN is able to advantage of +networks with massive amounts of data. LBANN is able to take advantage of tightly-coupled accelerators, low-latency high-bandwidth networking, and high-bandwidth parallel file systems. @@ -22,6 +22,8 @@ recurrent neural networks via back propagation through time (BPTT) training, transfer learning, and multi-model and ensemble training methods. +Users are advised to view `the Doxygen API Documentation +<_static/doxygen/html/index.html>`_ for API information. .. toctree:: :maxdepth: 2 @@ -40,8 +42,11 @@ methods. :maxdepth: 2 :caption: Developer Documentation + lbann lbann/lbann style_guide + continuous_integration + documentation_building ================== diff --git a/docs/lbann.rst b/docs/lbann.rst new file mode 100644 index 00000000000..30438e05af2 --- /dev/null +++ b/docs/lbann.rst @@ -0,0 +1,85 @@ +************************************************** +LBANN Software Architecture and Class Overview +************************************************** + +Trainers (i.e. execution environment) +****************************************** + +A trainer is a collection of compute resources and defines an explicit +communication domain. It provides the execution environment for both the training +and inference of a model. Once constructed, a trainer owns an +LBANN comm object that defines both intra- and inter-trainer +communication domains. Additionally, a trainer will contain an I/O +thread pool that is used to fetch and pre-process data that will be +provided to the trainer's models. + +A trainer owns: + +* comm object +* I/O thread pool +* One or more models +* Execution context for each model +* In the future, it will also contain the data readers. + +Execution Context +****************************************** + +When a model is attached to a trainer, the execution context of the +training algorithm is stored in an execution_context class (or +sub-class) per execution mode. Thus, there is one execution context +per model and mode that contains all of the state with respect to the +training algorithm being applied to the model. + +For example, it tracks the current: + +* step +* execution mode +* epoch +* and a pointer back to the trainer + +Termination Criteria (Pending) +****************************************** + +(Pending feature) When a model is going to be trained or evaluated, +the termination criteria are specified in an object that is passed into +the training algorithm. (Note that this feature is under development; +currently the termination criteria are dictated by when the training +algorithm executes a fixed number of epochs.)
+ +Training Algorithms +****************************************** + +The training algorithm defines the optimization that is to be +applied to the model(s) being trained. Additionally, it can +specify how to evaluate the model. + +Model +****************************************** + +A model is a collection of operations with dependencies encoded as a +directed acyclic graph (DAG). In a typical formulation, these +operations form a neural network that will be either trained or used +for inference. Each operation in the model is an instance of the +layer class. The model is then a collection of layers that perform +transformations and mathematical operations on data that is passed +between layers. The model's DAG is executed in topological order. +Inside of some layer types are weight matrices that define a trained +model. (Note that LBANN should be able to support non-DNN models, but +this is a subject for future work.) + +Each layer in the graph contains a set of tensors that holds the +inputs, computed outputs, gradients with respect to the outputs, and +gradients with respect to the inputs. Furthermore, for each layer in +the graph with learnable parameters, there is an associated weight +tensor that forms the learned weights of the model. The model also +owns the objective function, since that is integrally tied into the +model's computational graph. Additionally, the model owns the +default optimizer that is used to provide a standard optimizer for the +model's weight tensors. Once each weight tensor is instantiated, it +will own an instance of an optimizer. + +The model also owns the max_mini_batch_size that is supported by the +model. This is because it changes the size and shape of +input, output, and gradient tensors. Additionally, the model owns a +field that controls whether background I/O is allowed for this model and +its associated data reader. diff --git a/docs/publications.rst b/docs/publications.rst index c2bb25449cd..aa22a58b0e6 100644 --- a/docs/publications.rst +++ b/docs/publications.rst @@ -3,10 +3,31 @@ Papers, Presentations, and Posters Publications about or related to using LBANN: ++ Nikoli Dryden, Naoya Maruyama, Tom Benson, Tim Moon, Marc Snir, + Brian Van Essen. "Channel and Filter Parallelism for Large-Scale + CNN Training", to appear in *International Conference for High + Performance Computing, Networking, Storage and Analysis (SC'19)*, 2019. + ++ Sam Ade Jacobs, Brian Van Essen, Tim Moon, Jae Seung Yeom, David + Hysom, Brian Spears, Rushil Anirudh, Jayaraman Thiagaranjan, Shusen + Liu, Jim Gaffney, Peer-Timo Bremer, Tom Benson, Peter Robinson, and + Luc Peterson, "Parallelizing Training of Deep Generative Models on + Massive Scientific Datasets", to appear in *Proceedings of Cluster + Computing*, 2019 + ++ Shusen Liu, Di Wang, Dan Maljovec, Rushil Anirudh, + Jayaraman J. Thiagarajan, Sam Ade Jacobs, Brian C. Van Essen, David + Hysom, Jae-Seung Yeom, Jim Gaffney, Luc Peterson, Peter B. Robinson, + Harsh Bhatia, Valerio Pascucci, Brian K. Spears, Peer-Timo Bremer. + `"Scalable Topological Data Analysis and Visualization for + Evaluating Data-Driven Models in Scientific Applications" + `_, to appear in *IEEE Transactions + on Visualization and Computer Graphics*, 2019 + + Nikoli Dryden, Naoya Maruyama, Tom Benson, Tim Moon, Marc Snir, Brian Van Essen. `"Improving Strong-Scaling of CNN Training by Exploiting Finer-Grained Parallelism" - `_, to appear in *IEEE + `_, in *Proceedings of IEEE International Parallel & Distributed Processing Symposium*, 2019.
+ `IPDPS'19 `_ diff --git a/docs/running_lbann.rst b/docs/running_lbann.rst index d98e5fa62fe..124d9d28cbf 100644 --- a/docs/running_lbann.rst +++ b/docs/running_lbann.rst @@ -1,96 +1,454 @@ .. role:: bash(code) :language: bash +.. role:: python(code) + :language: python -==================== +============================================================ Running LBANN -==================== +============================================================ -The basic template for running LBANN is +------------------------------------------------ +Anatomy of an LBANN experiment +------------------------------------------------ + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +LBANN is run under `MPI +`_, i.e. with +multiple processes that communicate with message passing. This set of +processes is subdivided into one or more "trainers." Conceptually, a +trainer owns parallel objects, like models and data readers, and +generally operates independently of other trainers. + +Comments: + ++ LBANN targets HPC systems with homogeneous compute nodes and GPU + accelerators, which motivates some simplifying assumptions: + + - All trainers have the same number of processes. + + - If GPU acceleration is enabled, each MPI process corresponds to + one GPU. + ++ Processors are block assigned to trainers based on MPI rank. + + - In order to minimize the cost of intra-trainer communication, make + sure to map processes to the hardware and network + topologies. Typically, this just means choosing a sensible number + of processes per trainer, e.g. a multiple of the number of GPUs + per compute node. + ++ Generally, increasing the number of processes per trainer will + accelerate computation but require more intra-trainer + communication. There is typically a sweet spot where run time is + minimized, but it is complicated and sensitive to the nature of the + computation, the mini-batch size, the data partitioning scheme, + hardware and network properties, the communication algorithms, and + myriad other factors. + + - Rule-of-thumb: Configure experiments so that the bulk of run time + is taken by compute-bound operations (e.g. convolution or matrix + multiplication) and so that each process has enough work to + achieve a large fraction of peak performance (e.g. by making the + mini-batch size sufficiently large). + ++ Most HPC systems are managed with job schedulers like `Slurm + `_. Typically, users can + not immediately access compute nodes but must request them from + login nodes. The login nodes can be accessed directly (e.g. via + :bash:`ssh`), but users are discouraged from doing heavy computation + on them. + + - For debugging and quick testing, it's convenient to request an + interactive session (:bash:`salloc` or :bash:`sxterm` with Slurm). + + - If you need to run multiple experiments or if experiments are not + time-sensitive, it's best to submit a batch job (:bash:`sbatch` + with Slurm). + + - When running an experiment, make sure you know what scheduler + account to charge (used by the scheduler for billing and + determining priority) and what scheduler partition to run on + (compute nodes on a system are typically subdivided into multiple + groups, e.g. for batch jobs and for debugging). + + + With :bash:`salloc`, specify the partition using the + :bash:`--partition` command-line argument and specify the + account using :bash:`--account`. + + - Familiarize yourself with the rules for the systems you use + (e.g. 
the expected work for each partition, time limits, job + submission limits) and be a good neighbor. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Model components +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ++ Layer: A tensor operation, arranged within a directed acyclic graph. + + - During evaluation ("forward prop"), a layer receives input tensors + from its parents and sends an output tensor to each child. + + - During automatic differentiation ("backprop"), a layer receives + "input error signals" (objective function gradients w.r.t. output + tensors) from its children and sends "output error signals" + (objective function gradients w.r.t. input tensors) to its + parents. If the layer has any associated weight tensors, it will + also compute objective function gradients w.r.t. the weight + tensors. + + - Most layers require a specific number of parents and children, but + LBANN will insert layers into the graph if there is a mismatch and + the intention is obvious. For example, if a layer expects one + child but has multiple, then a split layer (with multiple output + tensors all identical to the input tensor) is inserted. Similarly, + if a layer has fewer children than expected, dummy layers will be + inserted. However, this does not work if there is any + ambiguity. In such cases (common with input and slice layers), it + is recommended to manually insert identity layers so that the + parent/child relationships are absolutely unambiguous. + + - See `lbann/src/proto/layers.proto + `_ + for a full list of supported layers. + ++ Weights: A tensor consisting of trainable parameters, typically + associated with one or more layers. A weight tensor owns an + initializer to initially populate its values and an optimizer to + find values that minimize the objective function. + + - A weight tensor without a specified initializer will use a zero + initializer. + + - A weight tensor without a specified optimizer will use the model's + default optimizer. + + - If a layer requires weight tensors and none are specified, it will + create the needed weight tensors. The layer will pick sensible + initializers and optimizers for the weight tensors. For example, a + convolution layer will initialize its kernel tensor with He normal + initialization and with the model's default optimizer. + + - The dimensions of a weight tensor is determined by their + associated layers. The user can not set it directly. + ++ Objective function: Mathematical expression that the optimizers will + attempt to minimize. It is made up of multiple terms that are added + together (possibly with scaling factors). + + - An objective function term can get its value from a scalar-valued + layer, i.e. a layer with an output tensor with one entry. + ++ Metric: Mathematical expression that will be reported to the + user. This typically does not affect training, but is helpful for + evaluating the progress of training. A canonical example for + classification problems is classification accuracy. + ++ Callback: Function that is performed at various points during an + experiment. Callbacks are helpful for reporting, debugging, and + performing advanced training techniques. + + - This is the natural home for experimental training + techniques. + + - A common use-case is to export values with the "dump outputs" + callback so that the user can perform data post-processing or + visualization. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Data readers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
warning:: The core infrastructure for data readers is slated for + significant refactoring, so expect major changes in the + future. + +Data readers are responsible for managing a data set and providing +data samples to models. A data set is comprised of independent data +samples, each of which is made up of multiple tensors. For example, a +data sample for a labeled image classification problem consists of an +image tensor and a one-hot label vector. + +.. note:: The data readers are currently hard-coded to assume this + simple classification paradigm. Hacks are needed if your + data does not match it exactly, e.g. if a data sample is + comprised of more than two tensors. The most basic approach + is to flatten all tensors and concatenate them into one + large vector. The model is then responsible for slicing this + vector into the appropriate chunks and resizing the chunks + into the appropriate dimensions. Done correctly, this should + not impose any additional overhead. + +Specifically, data readers and models interact via input layers. Each +model must have exactly one input layer and its output tensors are +populated by a data reader every mini-batch step. This is typically +performed by a background thread pool, so data ingestion will +efficiently overlap with other computation, especially if the data +reader's work is IO-bound or if the computation is largely on GPUs. + +.. note:: An input layer has an output tensor for each data sample + tensor. Since each data sample has two tensors (one for the + data and one for the label), it follows that every input + layer should have two child layers. To make parent/child + relationships unambiguous, we recommend manually creating + identity layers as children of the input layer. + +Note that layers within a model treat the data for a mini-batch as a +single tensor where the leading dimension is the mini-batch size. +Thus, corresponding tensors in all data samples must have the same +dimensions. The data dimensions must be known from the beginning of +the experiment and can not change. However, real data is rarely so +consistent and some preprocessing is typically required. See +`lbann/src/proto/transforms.proto +`_ +for a list of available preprocessing transforms. + +.. warning:: The Python data reader will trigger some process forking + that doesn't interact with InfiniBand all that well by + default. Users may encounter hangs on clusters that use + InfiniBand. To avoid this, ensure that + :bash:`IBV_FORK_SAFE=1` is exported into the environment + when running LBANN. + +------------------------------------------------ +Python frontend +------------------------------------------------ + +LBANN provides a Python frontend with syntax reminiscent of `PyTorch +`_. See `a simple implementation of LeNet +`_. + +Comments: + ++ Under-the-hood, the Python frontend is actually a convenience + wrapper around the Protobuf frontend. The core infrastructure allows + users to configure an experiment and "compiles" it to a Prototext + text file. + ++ The Python interface can only configure and launch experiments. It + is not active during an experiment and it does not allow for any + dynamic control flow. + ++ Only Python 3 is supported. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Setup +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The :python:`lbann` Python package is installed as part of the LBANN +build process. However, it is necessary to update the +:bash:`PYTHONPATH` environment variable to make sure Python detect +it. 
There are several ways to do this: + ++ If LBANN has been built with the Spack user build process, loading + LBANN will automatically update :bash:`PYTHONPATH`: .. code-block:: bash - \ - lbann \ - --model=model.prototext \ - --optimizer=opt.prototext \ - --reader=data_reader.prototext - -When using GPGPU accelerators, users should be aware that LBANN is -optimized for the case in which one assigns one GPU per MPI -*rank*. This should be borne in mind when choosing the parameters for -the MPI launcher. - -A list of options for LBANN may be found by running :bash:`lbann ---help`. - -.. note:: At time of writing, it is known that some of these are - out-of-date. An - `issue `_ has been - opened to track this. - -.. _using-the-model-zoo: - --------------------- -Using the model zoo --------------------- - -LBANN ships with prototext descriptions of a variety of models, -optimizers and data readers. These may be found in the :code:`model_zoo/` -directory of the source repository or the :code:`share/model_zoo/` directory -of the install directory. - -.. warning:: Some of these prototexts point to specific data locations - on LLNL LC clusters. Users may have to modify such paths - to point to locations on their own systems. This can be - done by modifying the prototext directly or overriding - the options on the command line with, e.g., the - :code:`--data_filedir_train` and - :code:`--data_filedir_test` options. - -The following is an example invocation of LBANN on a machine using -Slurm's :bash:`srun` as an MPI launcher. In the example command, -a machine with 2 GPGPUs per node are available, 4 nodes will be used, -:bash:`${LBANN_EXE}` is the path to the :code:`lbann` executable, and -:bash:`${LBANN_MODEL_ZOO_DIR}` is the path to the :code:`model_zoo/` directory in -either the source tree or the install tree. Note that the options -passed to :bash:`srun` are not likely to be portable to other MPI -launchers. The example will train Alexnet with SGD optimization on the -Imagenet dataset for 5 epochs. + module load lbann + +.. warning:: The above will *not* work if LBANN has been built with + :bash:`scripts/build_lbann_lc.sh` or with the Spack + developer build process. + ++ LBANN includes a modulefile that updates :bash:`PYTHONPATH`: .. code-block:: bash - srun -N4 --ntasks-per-node=2 \ - ${LBANN_EXE} \ - --model=${LBANN_MODEL_ZOO_DIR}/models/alexnet/alexnet.prototext \ - --optimizer=${LBANN_MODEL_ZOO_DIR}/optimizers/opt_sgd.prototext \ - --reader=${LBANN_MODEL_ZOO_DIR}/data_readers/data_reader_imagenet.prototext \ - --num_epochs=5 - ---------------------------------------------- -Using the Python interface for prototext ---------------------------------------------- - -There is a python interface for generating model prototext -files. Example Python scripts may be found in the -:code:`scripts/proto/lbann/models` directory of the source -repository. Running the Python script will generate a prototext that -can be passed to the :code:`--model` option for LBANN. + module use /etc/modulefiles + module load lbann- + ++ Directly manipulate :bash:`PYTHONPATH`: .. code-block:: bash - - python3 alexnet.py alexnet.prototext - \ - lbann --model=alexnet.prototext -where :code:`` are as documented -:ref:`above `, with optimizer and data reader -prototexts coming from the appropriate :code:`model_zoo/` directories. + export PYTHONPATH=/lib/python/site-packages:${PYTHONPATH} + +Note that LBANN depends on the Protobuf Python package, which can be +installed with: + +.. 
code-block:: bash + + pip install protobuf + +If the user does not own the site-packages directory, then it may be +necessary to pass the :bash:`--user` flag to pip. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Basic usage +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A typical workflow involves the following steps: + +1. Configuring a :python:`Trainer`. + +2. Configuring LBANN model components (like the graph of + :python:`Layer` s) and creating a :python:`Model`. + + + Classes for model components are automatically generated from the + LBANN Protobuf specifications in `lbann/src/proto + `_. These + files are currently the best source of documentation. Message + fields in the Protobuf specification are optional keyword + arguments for the corresponding Python class constructor. If a + keyword argument is not provided, it is logically zero (e.g. false + for Boolean fields and empty for string fields) + +3. Configuring the default :python:`Optimizer` to be used by the + :python:`Weights` objects. + +4. Loading in a Protobuf text file describing the data reader. + + + The Python frontend currently does not have good support for + specifying data readers. If any data reader properties need to be + set programmatically, the user must do it directly via the + Protobuf Python API. + +5. Launching LBANN by calling :python:`run`. + + + :python:`lbann.run` should be run from a compute node. If a node + allocation is not available, the :python:`batch_job` option can + be set to submit a batch job to the scheduler. + + + A timestamped work directory will be created each time LBANN is + run. The default location of these work directories can be set + with the environment variable :bash:`LBANN_EXPERIMENT_DIR`. + + + Supported job managers are Slurm and LSF. + + + LLNL users and collaborators may prefer to use + :python:`lbann.contrib.launcher.run`. This is similar to + :python:`lbann.run`, with defaults and optimizations for certain + systems. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A simple example +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import lbann + + # ---------------------------------- + # Construct layer graph + # ---------------------------------- ------------------------------- -Running the inference engine ------------------------------- + # Input data + input = lbann.Input() + image = lbann.Identity(input) + label = lbann.Identity(input) -This section is under construction, requiring input from other team -members. Until it is complete, please ask questions on the -`issue tracker `_. 
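+   # (The input layer exposes one output tensor per data sample tensor,
+   # here the image and the label; the identity layers above make the
+   # parent/child relationships unambiguous, as recommended in the data
+   # reader notes.)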
+ # Softmax classifier + y = lbann.FullyConnected(image, num_neurons = 10, has_bias = True) + pred = lbann.Softmax(y) + + # Loss function and accuracy + loss = lbann.CrossEntropy([pred, label]) + acc = lbann.CategoricalAccuracy([pred, label]) + + # ---------------------------------- + # Setup experiment + # ---------------------------------- + + # Setup trainer + trainer = lbann.Trainer() + + # Setup model + mini_batch_size = 64 + num_epochs = 5 + model = lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(input), + objective_function=loss, + metrics=[lbann.Metric(acc, name='accuracy', unit='%')], + callbacks=[lbann.CallbackPrint(), lbann.CallbackTimer()]) + + # Setup optimizer + opt = lbann.SGD(learn_rate=0.01, momentum=0.9) + + # Load data reader from prototext + import google.protobuf.text_format + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open('path/to/lbann/model_zoo/data_readers/data_reader_mnist.prototext', 'r') as f: + google.protobuf.text_format.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + # ---------------------------------- + # Run experiment + # ---------------------------------- + + lbann.run(trainer, model, data_reader_proto, opt) + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Useful submodules +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +^^^^^^^^^^^^^^^^^^^^^^^^ +:python:`lbann.modules` +^^^^^^^^^^^^^^^^^^^^^^^^ + +A :python:`Module` is a pattern of layers that can be applied multiple +times in a neural network. Once created, a :python:`Module` is +*callable*, taking a layer as input and returning a layer as +output. They will create and manage :python:`Weights` objects internally, +so they are convenient for weight sharing between different +layers. They are also useful for complicated patterns like RNN cells. + +*A possible note of confusion*: "Modules" in LBANN are similar to +"layers" in PyTorch, TensorFlow, and Keras. LBANN uses "layer" to +refer to tensor operations, in a similar manner to Caffe. + +^^^^^^^^^^^^^^^^^^^^^^^^ +:python:`lbann.models` +^^^^^^^^^^^^^^^^^^^^^^^^ + +Several common and influential neural network models are implemented +as :python:`Module` s. They can be used as building blocks within more +complicated models. + +^^^^^^^^^^^^^^^^^^^^^^^^ +:python:`lbann.proto` +^^^^^^^^^^^^^^^^^^^^^^^^ + +The :python:`save_prototext` function will export a Protobuf text +file, which can be fed into the Protobuf frontend. + +^^^^^^^^^^^^^^^^^^^^^^^^ +:python:`lbann.onnx` +^^^^^^^^^^^^^^^^^^^^^^^^ + +This contains functionality to convert between LBANN and ONNX +models. See `python/docs/onnx/README.md +`_ +for full documentation. + +------------------------------------------------ +Protobuf frontend (advanced) +------------------------------------------------ + +The main LBANN driver uses Protobuf text files (sometimes called +prototext files) to specify experiments. The Python frontend operates +by "compiling" an experiment configuration into a Protobuf text file +and passing it into the LBANN driver. Aside from quick debugging, +there are very few situations where directly manipulating Protobuf +text files is superior to using the Python frontend. In fact, it is +possible to use Protobuf's Python API to programmatically manipulate +Protobuf messages, if such fine control is necessary. + +In order to fully specify an experiment, the user must provide +Protobuf text files for the model, default optimizer, and data +reader. These can be provided as three separate files or one unified +file.
The basic template for running LBANN is + +.. code-block:: bash + + \ + lbann --prototext=experiment.prototext +The LBANN Protobuf format is defined in `src/proto/lbann.proto +`_. It +is important to remember that the default value of a Protobuf field is +logically zero (e.g. false for Boolean fields and empty for string +fields). diff --git a/external/TBinf/TBinf.cpp b/external/TBinf/TBinf.cpp index b92141f9bc5..90e9dabdd8e 100644 --- a/external/TBinf/TBinf.cpp +++ b/external/TBinf/TBinf.cpp @@ -68,6 +68,26 @@ void SummaryWriter::add_scalar(const std::string tag, float value, write_summary_event(s, step); } +void SummaryWriter::add_image(const std::string& tag, + std::string encoded_img, + const std::vector& dims, + int64_t step){ + + auto s = std::unique_ptr(new tensorflow::Summary()); + tensorflow::Summary::Value *v = s->add_value(); + v->set_tag(tag); + tensorflow::Summary_Image *img = v->mutable_image(); + img->Clear(); + img->set_colorspace(dims[0]); + img->set_height(dims[1]); + img->set_width(dims[2]); + + img->set_encoded_image_string(std::move(encoded_img)); + + write_summary_event(s.release(), step); +} + + void SummaryWriter::add_histogram(const std::string tag, std::vector::const_iterator first, std::vector::const_iterator last, diff --git a/external/TBinf/TBinf.hpp b/external/TBinf/TBinf.hpp index 0a11937da71..0edcad8aa54 100644 --- a/external/TBinf/TBinf.hpp +++ b/external/TBinf/TBinf.hpp @@ -39,27 +39,40 @@ namespace TBinf { /** - * Write data to a Tensorboard logging directory. - * This writes data in the same format as Tensorflow does. + * @brief Write data to Tensorboard logging directory in Tensorflow format. */ class SummaryWriter { public: /** - * Create a new event file in logdir to write to. + * @brief Create a new event file in logdir to write to. * @param logdir The directory where the event file will be written. */ SummaryWriter(const std::string logdir); ~SummaryWriter(); /** - * Add a scalar value to the event file. + * @brief Add a scalar value to the event file. * @param tag The tag for this summary. * @param value The scalar value. * @param step Optional global step. */ void add_scalar(const std::string tag, float value, int64_t step = -1); + + /** + * @brief Add an image to the event file. + * @param tag The tag for this summary. + * @param encoded_img The image to be written. + * @param dims The dimensions of the image. + * @param step Optional global step. + */ + + void add_image(std::string const& tag, + std::string encoded_img, + const std::vector& dims, + int64_t step = -1); + /** - * Add a histogram of values to the event file. + * @brief Add a histogram of values to the event file. * @param tag The tag for this summary. * @param first Iterator to the first value to add. * @param last Iterator past the last value to add. @@ -70,7 +83,7 @@ class SummaryWriter { std::vector::const_iterator last, int64_t step = -1); /** - * Add a histogram based upon buckets to the event file. + * @brief Add a histogram based upon buckets to the event file. * @param tag The tag for this summary. * @param buckets The histogram buckets. * @param min The minimum value in the dataset. @@ -85,44 +98,44 @@ class SummaryWriter { double min, double max, double num, double sum, double sqsum, int64_t step = -1); - /** Return the current histogram buckets. */ + /** @brief Return the current histogram buckets. */ const std::vector& get_histogram_buckets() const; - /** Return the default histogram buckets. */ + /** @brief Return the default histogram buckets. 
*/ static std::vector get_default_histogram_buckets(); - /** Ensure all events are written out. */ + /** @brief Ensure all events are written out. */ void flush(); private: /** - * Write a summary to the event file. + * @brief Write a summary to the event file. * @param s The summary to write. * @param step Optional global step for the event. */ void write_summary_event(tensorflow::Summary *s, int64_t step = -1); /** - * Write an event to the event file. + * @brief Write an event to the event file. * @param e The event to write. */ void write_event(tensorflow::Event& e); - /** Get current wall time in fractional seconds. */ + /** @brief Get current wall time in fractional seconds. */ double get_time_in_seconds(); - /** Initialize histogram buckets. */ + /** @brief Initialize histogram buckets. */ void init_histogram_buckets(); - /** Current event version. */ + /** @brief Current event version. */ static constexpr const char *EVENT_VERSION = "brain.Event:2"; - /** Filename to write to. */ + /** @brief Filename to write to. */ std::string filename; - /** File stream for writing. */ + /** @brief File stream for writing. */ std::fstream file; - /** Current histogram buckets. */ + /** @brief Current histogram buckets. */ std::vector histogram_buckets; }; diff --git a/include/lbann/CMakeLists.txt b/include/lbann/CMakeLists.txt index 28123a8350b..bfd9b756b61 100644 --- a/include/lbann/CMakeLists.txt +++ b/include/lbann/CMakeLists.txt @@ -8,15 +8,21 @@ set_full_path(THIS_DIR_HEADERS # Add the subdirectories add_subdirectory(callbacks) +add_subdirectory(data_coordinator) add_subdirectory(data_readers) add_subdirectory(data_store) +add_subdirectory(execution_contexts) add_subdirectory(io) add_subdirectory(layers) +add_subdirectory(macros) add_subdirectory(metrics) add_subdirectory(models) add_subdirectory(objective_functions) add_subdirectory(optimizers) add_subdirectory(proto) +add_subdirectory(trainers) +add_subdirectory(training_algorithms) +add_subdirectory(transforms) add_subdirectory(utils) add_subdirectory(weights) diff --git a/include/lbann/base.hpp b/include/lbann/base.hpp index a4baa63c443..996bb655524 100644 --- a/include/lbann/base.hpp +++ b/include/lbann/base.hpp @@ -27,24 +27,35 @@ #ifndef LBANN_BASE_HPP_INCLUDED #define LBANN_BASE_HPP_INCLUDED -#include "El.hpp" -#include "lbann/Elemental_extensions.hpp" -#include "lbann/utils/cyg_profile.hpp" -#include "lbann/utils/file_utils.hpp" +#include // Defines, among other things, DataType. #include "lbann_config.hpp" +#include "lbann/Elemental_extensions.hpp" +#include "lbann/utils/cyg_profile.hpp" +#include "lbann/utils/file_utils.hpp" +#include "lbann/utils/enum_iterator.hpp" +#ifdef LBANN_HAS_HALF +#include "lbann/utils/serialization.hpp" +#endif // LBANN_HAS_HALF + // Support for OpenMP macros #include "lbann/utils/omp_pragma.hpp" #include +#include +#include +#include namespace lbann { // Forward-declaration. class lbann_comm; +/// Creating an observer_ptr to complement the unique_ptr and shared_ptr +template using observer_ptr = typename std::add_pointer::type; + // Note that this should only be used to wrap the thing coming out of // initialize()! This will be removed when we have proper RAII around // these things. @@ -63,7 +74,7 @@ using world_comm_ptr = * @param seed RNG seed. * @return LBANN communicator corresponding to MPI_COMM_WORLD. */ -world_comm_ptr initialize(int& argc, char**& argv, int seed = -1); +world_comm_ptr initialize(int& argc, char**& argv); /** Destroy LBANN communicator. 
* @@ -72,6 +83,14 @@ world_comm_ptr initialize(int& argc, char**& argv, int seed = -1); */ void finalize(lbann_comm* comm = nullptr); +#ifdef LBANN_HAS_HALF +using cpu_fp16 = El::cpu_half_type; +#endif + +#ifdef LBANN_HAS_GPU_FP16 +using fp16 = El::gpu_half_type; +#endif + // Typedefs for Elemental matrices using AbsMat = El::AbstractMatrix; using CPUMat = El::Matrix; @@ -79,6 +98,7 @@ using CPUMat = El::Matrix; using GPUMat = El::Matrix; #endif // LBANN_HAS_GPU using AbsDistMat = El::AbstractDistMatrix; +using BaseDistMat = El::BaseDistMatrix; // Deprecated typedefs /// @todo Remove @@ -90,22 +110,45 @@ template using AbsDistMatReadProxy = El::AbstractDistMatrixReadDeviceProxy; using ElMat = El::ElementalMatrix; using BlockMat = El::BlockMatrix; + +template +using CPUMatDT = El::Matrix; + +template +using MCMRMatDT = El::DistMatrix; +template +using CircMatDT = El::DistMatrix; +template +using StarMatDT = El::DistMatrix; +template +using StarVCMatDT = El::DistMatrix; +template +using VCStarMatDT = El::DistMatrix; /// ColSumStarVCMat +template +using MCStarMatDT = El::DistMatrix; /// RowSumMat +template +using MRStarMatDT = El::DistMatrix; /// ColSumMat +template +using StarMRMatDT = El::DistMatrix; +template +using DistMatDT = MCMRMatDT; + template -using MCMRMat = El::DistMatrix; +using MCMRMat = MCMRMatDT; template -using CircMat = El::DistMatrix; +using CircMat = CircMatDT; template -using StarMat = El::DistMatrix; +using StarMat = StarMatDT; template -using StarVCMat = El::DistMatrix; +using StarVCMat = StarVCMatDT; template -using VCStarMat = El::DistMatrix; /// ColSumStarVCMat +using VCStarMat = VCStarMatDT; /// ColSumStarVCMat template -using MCStarMat = El::DistMatrix; /// RowSumMat +using MCStarMat = MCStarMatDT; /// RowSumMat template -using MRStarMat = El::DistMatrix; /// ColSumMat +using MRStarMat = MRStarMatDT; /// ColSumMat template -using StarMRMat = El::DistMatrix; +using StarMRMat = StarMRMatDT; using DistMat = MCMRMat; using Mat = El::Matrix; // Temporarily define as CPUMat @@ -116,42 +159,25 @@ using EvalType = double; /// Distributed matrix format enum class matrix_format {MC_MR, CIRC_CIRC, STAR_STAR, STAR_VC, MC_STAR, invalid}; +/// @todo This should move to hydrogen +std::string to_string(El::Device const& d); +El::Device device_from_string(std::string const& str); + /// Data layout that is optimized for different modes of parallelism enum class data_layout {MODEL_PARALLEL, DATA_PARALLEL, invalid}; -static matrix_format __attribute__((used)) data_layout_to_matrix_format(data_layout layout) { - matrix_format format; - switch(layout) { - case data_layout::MODEL_PARALLEL: - format = matrix_format::MC_MR; - break; - case data_layout::DATA_PARALLEL: - /// Weights are stored in STAR_STAR and data in STAR_VC - format = matrix_format::STAR_STAR; - break; - default: - throw(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " Invalid data layout selected"); - } - return format; -} +matrix_format data_layout_to_matrix_format(data_layout layout); +std::string to_string(data_layout const& dl); +data_layout data_layout_from_string(std::string const& str); /// Neural network execution mode enum class execution_mode {training, validation, testing, prediction, invalid}; -static const char *__attribute__((used)) _to_string(execution_mode m) { - switch(m) { - case execution_mode::training: - return "training"; - case execution_mode::validation: - return "validation"; - case execution_mode::testing: - return "testing"; - case execution_mode::prediction: - return "prediction"; 
- case execution_mode::invalid: - return "invalid"; - default: - throw("Invalid execution mode specified"); /// @todo this should be an lbann_exception but then the class has to move to resolve dependencies - } -} +std::string to_string(execution_mode m); +using execution_mode_iterator = enum_iterator; + +/** @brief Convert a string to an execution_mode. */ +execution_mode exec_mode_from_string(std::string const& str); +/** @brief Extract an execution_mode from a stream. */ +std::istream& operator>>(std::istream& os, execution_mode& e); /** Pooling layer mode */ enum class pool_mode {invalid, max, average, average_no_pad}; @@ -159,56 +185,26 @@ enum class pool_mode {invalid, max, average, average_no_pad}; /** returns a string representation of the pool_mode */ std::string get_pool_mode_name(pool_mode m); -// NA - Not applicable, used for input layers that don't produce a second output -enum class data_reader_target_mode {CLASSIFICATION, REGRESSION, RECONSTRUCTION, NA}; - /* * endsWith: http://thispointer.com/c-how-to-check-if-a-string-ends-with-an-another-given-string/ * Case Sensitive Implementation of endsWith() * It checks if the string 'mainStr' ends with given string * 'toMatch' */ -static bool __attribute__((used)) endsWith(const std::string mainStr, const std::string &toMatch) -{ - if(mainStr.size() >= toMatch.size() && - mainStr.compare(mainStr.size() - toMatch.size(), toMatch.size(), toMatch) == 0) - return true; - else - return false; -} +bool endsWith(const std::string mainStr, const std::string &toMatch); /// Print the dimensions and name of a Elemental matrix -static void __attribute__((used)) _print_matrix_dims(AbsDistMat *m, const char *name) { - std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; -} -#define PRINT_MATRIX_DIMS(x) _print_matrix_dims(x, #x); +void print_matrix_dims(AbsDistMat *m, const char *name); +#define LBANN_PRINT_MATRIX_DIMS(x) print_matrix_dims(x, #x); /// Print the dimensions and name of a Elemental matrix -static void __attribute__((used)) _print_local_matrix_dims(AbsMat *m, const char *name) { - std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; -} -#define PRINT_LOCAL_MATRIX_DIMS(x) _print_local_matrix_dims(x, #x); - -// FIXME -#if 1 -// __FILE__ -#define log_msg(...) {\ - char str[256];\ - sprintf(str, __VA_ARGS__);\ - std::cout << "[" << m_comm->get_trainer_rank() << "." << m_comm->get_rank_in_trainer() << "][" << __FUNCTION__ << "][Line " << __LINE__ << "]" << str << std::endl; \ - } -#define log_simple_msg(...) {\ - char str[256];\ - sprintf(str, __VA_ARGS__);\ - std::cout << "[" << __FUNCTION__ << "][Line " << __LINE__ << "]" << str << std::endl; \ - } -#else -#define log_msg(...) -#define log_simple_msg(...) -#endif +void print_local_matrix_dims(AbsMat *m, const char *name); +#define LBANN_PRINT_LOCAL_MATRIX_DIMS(x) print_local_matrix_dims(x, #x); + +#define LBANN_MAKE_STR_(x) #x +#define LBANN_MAKE_STR(x) LBANN_MAKE_STR_(x) -#define LBANN_MAKE_STR(x) _LBANN_MAKE_STR(x) -#define _LBANN_MAKE_STR(x) #x +void lbann_mpi_err_handler(MPI_Comm *comm, int *err_code, ... 
); } // namespace lbann diff --git a/include/lbann/callbacks/CMakeLists.txt b/include/lbann/callbacks/CMakeLists.txt index 8466fdf53ef..db67fe83570 100644 --- a/include/lbann/callbacks/CMakeLists.txt +++ b/include/lbann/callbacks/CMakeLists.txt @@ -1,35 +1,44 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS callback.hpp - callback_check_dataset.hpp - callback_check_gradients.hpp - callback_check_init.hpp - callback_check_metric.hpp - callback_checknan.hpp - callback_checksmall.hpp - callback_confusion_matrix.hpp - callback_debug.hpp - callback_debug_io.hpp - callback_dump_outputs.hpp - callback_dump_error_signals.hpp - callback_dump_gradients.hpp - callback_dump_minibatch_sample_indices.hpp - callback_dump_weights.hpp - callback_early_stopping.hpp - callback_hang.hpp - callback_imcomm.hpp - callback_io.hpp - callback_learning_rate.hpp - callback_ltfb.hpp - callback_perturb_adam.hpp - callback_print.hpp - callback_save_images.hpp - callback_save_model.hpp - callback_summary.hpp - callback_timer.hpp - callback_variable_minibatch.hpp + check_dataset.hpp + check_gradients.hpp + check_init.hpp + check_metric.hpp + check_nan.hpp + check_small.hpp + checkpoint.hpp + confusion_matrix.hpp + debug.hpp + debug_io.hpp + dump_error_signals.hpp + dump_gradients.hpp + dump_minibatch_sample_indices.hpp + dump_outputs.hpp + dump_weights.hpp + early_stopping.hpp + gpu_memory_usage.hpp + hang.hpp + imcomm.hpp + learning_rate.hpp + ltfb.hpp + mixup.hpp + monitor_io.hpp + perturb_adam.hpp + perturb_dropout.hpp + print_model_description.hpp + print_statistics.hpp profiler.hpp - callback_gpu_memory_usage.hpp + replace_weights.hpp + save_images.hpp + save_model.hpp + save_topk_models.hpp + set_weights_value.hpp + summary.hpp + sync_layers.hpp + timeline.hpp + timer.hpp + variable_minibatch.hpp ) # Propagate the files up the tree diff --git a/include/lbann/callbacks/callback.hpp b/include/lbann/callbacks/callback.hpp index fae45448bb8..36b0cd8cb67 100644 --- a/include/lbann/callbacks/callback.hpp +++ b/include/lbann/callbacks/callback.hpp @@ -23,20 +23,36 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback .hpp - Base class for LBANN callbacks +// callback .hpp - Base class for LBANN callbacks //////////////////////////////////////////////////////////////////////////////// -#ifndef __LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED -#define __LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED +#ifndef LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED -#include "lbann/base.hpp" -#include "lbann/utils/summary.hpp" -#include "lbann/models/model.hpp" +#include "lbann/trainers/trainer.hpp" #include "lbann/layers/layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/description.hpp" +#include "lbann/utils/memory.hpp" +#include "lbann/utils/summary.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" + +#include + +#include +#include + +/** @brief A utility macro for easily adding default-constructed sub-class + * builders.*/ +#define LBANN_ADD_DEFAULT_CALLBACK_BUILDER(Class, FunctionName) \ + inline std::unique_ptr FunctionName( \ + const google::protobuf::Message&, std::shared_ptr const&) { \ + return lbann::make_unique(); \ + } namespace lbann { -/** @class lbann_callback +/** @class callback_base * @brief Base class for callbacks during training/testing. * * The method of each callback is called at a given point during @@ -44,37 +60,35 @@ namespace lbann { * care about. 
Callbacks may be passed a lbann_summary instance, * which they can use to log any relevant information. */ -class lbann_callback { +class callback_base { public: /** @name Constructors and destructor */ ///@{ - /** @brief Initialize a callback with an optional batch interval and - * summarizer. + /** @brief Initialize a callback with an optional batch interval */ - lbann_callback(int batch_interval = 1, - lbann_summary *summarizer = nullptr) : - m_batch_interval(std::max(batch_interval, 1)), m_summarizer(summarizer) {} - lbann_callback(const lbann_callback&) = default; - virtual ~lbann_callback() {} + callback_base(int batch_interval = 1) : + m_batch_interval(std::max(batch_interval, 1)) {} + callback_base(const callback_base&) = default; + virtual ~callback_base() = default; ///@} /** @name Polymorphic copy */ ///@{ - virtual lbann_callback* copy() const = 0; + virtual callback_base* copy() const = 0; ///@} /** @name Modifiers */ ///@{ - void set_summarizer(lbann_summary *summarizer) { - m_summarizer = summarizer; - } + /** @brief Called once to set up the callback on the trainer + */ + virtual void setup(trainer *t) {}; - /** @brief Called once to set up the callback (after all layers are - * set up). + /** @brief Called once to set up the callback on the model + * (after all layers are set up). */ virtual void setup(model *m) {}; @@ -82,6 +96,8 @@ class lbann_callback { /** @name Callback hooks */ ///@{ + /** @brief Called at the end of setup. */ + virtual void on_setup_end(model *m) {} /** @brief Called at the beginning of training. */ virtual void on_train_begin(model *m) {} /** @brief Called at the end of training. */ @@ -166,25 +182,58 @@ class lbann_callback { /** @brief Return this callback's name. */ virtual std::string name() const = 0; + /** @brief Human-readable description. */ + virtual description get_description() const; + ///@} + /** @brief Build a standard directory hierachy including trainer, + * execution context, and model information (in that order). + */ + inline std::string get_multi_trainer_ec_model_path(const model& m, + const std::string& root_dir) { + std::string dir = root_dir; + if (dir.empty()) { dir = "./"; } + if (dir.back() != '/') { dir += "/"; } + + const auto& c = static_cast(m.get_execution_context()); + return build_string(dir, + c.get_trainer().get_name(), '/', + c.get_state_string(), '/', + m.get_name(), '/'); + } + + /** @brief Build a standard directory hierachy including trainer, + * model information in that order. + */ + inline std::string get_multi_trainer_model_path(const model& m, + const std::string& root_dir) { + std::string dir = root_dir; + if (dir.empty()) { dir = "./"; } + if (dir.back() != '/') { dir += "/"; } + + const auto& c = static_cast(m.get_execution_context()); + return build_string(dir, + c.get_trainer().get_name(), '/', + m.get_name(), '/'); + } + + protected: /** @brief Copy-assignment operator. * * Performs a shallow (pointer) copy of the summarizer. */ - lbann_callback& operator=(const lbann_callback&) = default; + callback_base& operator=(const callback_base&) = default; protected: - /** @todo Make lbann_callback data private */ + /** @todo Make callback data private */ /** @brief Batch methods should once every this many steps. */ int m_batch_interval; - /** @brief Optional summarizer for the callbacks to use. 
*/ - lbann_summary *m_summarizer; }; } // namespace lbann -#endif // __LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED +#endif // LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_dataset.hpp b/include/lbann/callbacks/callback_check_dataset.hpp deleted file mode 100644 index 09ce25d723f..00000000000 --- a/include/lbann/callbacks/callback_check_dataset.hpp +++ /dev/null @@ -1,73 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED - -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Save the sample indices for each mini-batch to ordered set. - * Check to make sure that all samples were properly processed. - */ -class lbann_callback_check_dataset : public lbann_callback { - public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_evaluate_forward_prop_end; - - lbann_callback_check_dataset() : - lbann_callback() {} - lbann_callback_check_dataset( - const lbann_callback_check_dataset&) = default; - lbann_callback_check_dataset& operator=( - const lbann_callback_check_dataset&) = default; - lbann_callback_check_dataset* copy() const override { - return new lbann_callback_check_dataset(*this); - } - void on_forward_prop_end(model *m, Layer *l) override; - void on_evaluate_forward_prop_end(model *m, Layer *l) override; - void on_epoch_end(model *m) override; - void on_validation_end(model *m) override; - void on_test_end(model *m) override; - - void add_to_set(model *m, Layer *l, int64_t step, std::set &set); - - std::string name() const override { return "check data set indices"; } - private: - /** @brief Basename for writing files. */ - std::string m_basename; - - std::set training_set; - std::set validation_set; - std::set testing_set; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_gradients.hpp b/include/lbann/callbacks/callback_check_gradients.hpp deleted file mode 100644 index 8433a00d5f1..00000000000 --- a/include/lbann/callbacks/callback_check_gradients.hpp +++ /dev/null @@ -1,83 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. 
-// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Gradient checking callback. - * Gradient checking is performed at the beginning of the test - * phase. Using a fourth-order finite difference scheme, a numerical - * partial derivative is computed for every weight parameter. If the - * numerical derivative differs signifcantly from the analytical - * derivative computed during backprop, the gradient check has - * failed. - */ -class lbann_callback_check_gradients : public lbann_callback { -public: - - /** Constructor. - * @param step_size Step size for numerical - * differentiation (with a step size of - * zero, the step size is chosen to - * minimize the numerical error). - * @param verbose Whether to print results for each - * parameter. - * @param error_on_failure Whether to throw an exception for - * large gradient errors. - */ - lbann_callback_check_gradients(DataType step_size = DataType(0), - bool verbose = false, - bool error_on_failure = false); - lbann_callback_check_gradients* copy() const override { - return new lbann_callback_check_gradients(*this); - } - void on_test_begin(model *m) override; - std::string name() const override { return "check gradients"; } - - /** Compute objective function value. - * It is assumed that input data has already been loaded into the - * activations of the first layer. - */ - DataType compute_objective_function(model *m); - -private: - - /** Step size for numerical differentiation. */ - DataType m_step_size; - /** Whether to print results for each parameter. */ - bool m_verbose; - /** Whether to throw an exception for large gradient errors. */ - bool m_error_on_failure; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_init.hpp b/include/lbann/callbacks/callback_check_init.hpp deleted file mode 100644 index 6d5572379fb..00000000000 --- a/include/lbann/callbacks/callback_check_init.hpp +++ /dev/null @@ -1,58 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. 
-// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_check_init .hpp .cpp - Check multi-model init -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Verify that every model uses the same initialization. - */ -class lbann_callback_check_init : public lbann_callback { - public: - lbann_callback_check_init() : lbann_callback() {} - lbann_callback_check_init(const lbann_callback_check_init&) = default; - lbann_callback_check_init& operator=( - const lbann_callback_check_init&) = default; - lbann_callback_check_init* copy() const override { - return new lbann_callback_check_init(*this); - } - /** Check initializations. */ - void on_train_begin(model *m) override; - std::string name() const override { return "check init"; } - private: - /** Return true if x == y. */ - bool check_equal(const AbsMat& x, const AbsMat& y) const; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_metric.hpp b/include/lbann/callbacks/callback_check_metric.hpp deleted file mode 100644 index 8b094c8c395..00000000000 --- a/include/lbann/callbacks/callback_check_metric.hpp +++ /dev/null @@ -1,78 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include - -namespace lbann { - -/** Metric checking callback. 
- * Checks if a metric value falls within an expected range. - */ -class lbann_callback_check_metric : public lbann_callback { -public: - - lbann_callback_check_metric(std::string metric_name, - std::set modes, - EvalType lower_bound, - EvalType upper_bound, - bool error_on_failure); - lbann_callback_check_metric* copy() const override { return new lbann_callback_check_metric(*this); } - std::string name() const override { return "check metric"; } - - void on_epoch_end(model* m) override { check_metric(*m); } - void on_validation_end(model* m) override { check_metric(*m); } - void on_test_end(model* m) override { check_metric(*m); } - -private: - - /** Metric name. */ - std::string m_metric_name; - - /** Execution modes with metric checks. */ - std::set m_modes; - - /** Lower bound for metric value. */ - EvalType m_lower_bound; - /** Upper bound for metric value. */ - EvalType m_upper_bound; - - /** Whether to throw an exception if metric check fails. */ - bool m_error_on_failure; - - /** Perform metric check. - * Does nothing if current execution mode is not in m_modes; - */ - void check_metric(const model& m) const; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_checknan.hpp b/include/lbann/callbacks/callback_checknan.hpp deleted file mode 100644 index c45a7eee95c..00000000000 --- a/include/lbann/callbacks/callback_checknan.hpp +++ /dev/null @@ -1,66 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_checknan .hpp .cpp - Check matrices for invalid numbers -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECKNAN_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECKNAN_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Check matrices for whether they include any NaNs or infs to help debugging. - * This will kill the rank if such values are discovered. 
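// A rough sketch of the range test the check_metric callback performs at epoch,
// validation, and test end; the function name and error text are illustrative.
#include <stdexcept>
#include <string>

// Returns true when the value lies in [lower, upper]; optionally throws on failure.
bool check_metric_value(const std::string& metric_name, double value,
                        double lower, double upper, bool error_on_failure) {
  const bool ok = (value >= lower) && (value <= upper);
  if (!ok && error_on_failure) {
    throw std::runtime_error("metric \"" + metric_name + "\" = " + std::to_string(value)
                             + " is outside [" + std::to_string(lower) + ", "
                             + std::to_string(upper) + "]");
  }
  return ok;
}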
- */ -class lbann_callback_checknan : public lbann_callback { - public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_end; - - lbann_callback_checknan() : lbann_callback() {} - lbann_callback_checknan(const lbann_callback_checknan&) = default; - lbann_callback_checknan& operator=( - const lbann_callback_checknan&) = default; - lbann_callback_checknan* copy() const override { - return new lbann_callback_checknan(*this); - } - /** Check that activations are good. */ - void on_forward_prop_end(model *m, Layer *l) override; - /** Check that error signals are good. */ - void on_backward_prop_end(model *m, Layer *l) override; - /** Check that gradients are good. */ - void on_backward_prop_end(model *m) override; - /** Check that weights are good. */ - void on_batch_end(model *m) override; - std::string name() const override { return "checknan"; } - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECKNAN_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_checkpoint.hpp b/include/lbann/callbacks/callback_checkpoint.hpp deleted file mode 100644 index ebeacdeaa7e..00000000000 --- a/include/lbann/callbacks/callback_checkpoint.hpp +++ /dev/null @@ -1,207 +0,0 @@ -////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_checkpoint .hpp .cpp - Callback hooks to checkpoint model -//////////////////////////////////////////////////////////////////////////////// -#ifndef LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include "lbann/io/persist.hpp" - -namespace lbann { - -/** @brief Checkpoint at given interval in given directory */ -class lbann_callback_checkpoint : public lbann_callback { - public: - - /** @brief Construct the checkpoint callback - * - * It may be beneficial to the distributed checkpoints at a higher - * tempo than the shared checkpoints because they are less - * expensive. 
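// The scan the checknan callback relies on reduces to a finiteness test over every
// matrix entry; a self-contained sketch, independent of LBANN's matrix types:
#include <cmath>
#include <vector>

// Returns true when every entry is finite, i.e. neither NaN nor +/-inf.
bool all_finite(const std::vector<float>& values) {
  for (const float v : values) {
    if (!std::isfinite(v)) { return false; }
  }
  return true;
}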
- * - * @param checkpoint_dir directory to save checkpoint files - * @param checkpoint_epochs interval to checkpoint - * @param checkpoint_steps interval to checkpoint - * @param checkpoint_secs interval to checkpoint - * @param per_rank_dir The directory into which to dump distributed checkpoints - * @param ckpt_dist_epochs The frequency of distributed checkpoints in epochs - * @param ckpt_dist_steps The frequence of distributed checkpoints in steps - */ - lbann_callback_checkpoint(std::string checkpoint_dir, - int checkpoint_epochs, - int checkpoint_steps, - int checkpoint_secs, - std::string per_rank_dir, - int ckpt_dist_epochs, - int ckpt_dist_steps) : - lbann_callback(), - m_checkpoint_dir(checkpoint_dir), - m_checkpoint_epochs(checkpoint_epochs), - m_checkpoint_steps(checkpoint_steps), - m_checkpoint_secs(checkpoint_secs), - m_per_rank_dir(per_rank_dir), - m_ckpt_dist_epochs(ckpt_dist_epochs), - m_ckpt_dist_steps(ckpt_dist_steps) {} - lbann_callback_checkpoint(const lbann_callback_checkpoint&) = default; - lbann_callback_checkpoint& operator=(const lbann_callback_checkpoint&) = default; - lbann_callback_checkpoint* copy() const override { return new lbann_callback_checkpoint(*this); } - void setup(model *m) override; - void on_epoch_end(model *m) override; - void on_batch_end(model *m) override; - void on_validation_end(model *m) override; - - inline void set_checkpoint_dir(std::string dir){ - m_checkpoint_dir= dir; - } - - inline void set_checkpoint_epochs(int epochs){ - m_checkpoint_epochs= epochs; - } - - inline void set_checkpoint_steps(int steps){ - m_checkpoint_steps= steps; - } - - inline void set_checkpoint_secs(EvalType secs){ - m_checkpoint_secs= secs; - } - - inline void set_per_rank_dir(std::string dir){ - m_per_rank_dir = dir; - } - - inline void set_ckpt_dist_epochs(int ckpt_dist_epochs){ - m_ckpt_dist_epochs = ckpt_dist_epochs; - } - - inline void set_ckpt_dist_steps(int ckpt_dist_steps){ - m_ckpt_dist_steps = ckpt_dist_steps; - } - - bool need_checkpoint(model *m); - bool checkpoint(model *m); - bool restart(model *m); - std::string name() const override { return "checkpoint"; } - protected: - std::string m_checkpoint_dir; - int m_checkpoint_epochs; - int m_checkpoint_steps; - EvalType m_checkpoint_secs; - std::string m_per_rank_dir; - int m_ckpt_dist_epochs; - int m_ckpt_dist_steps; - EvalType m_checkpoint_last; - persist p; - bool m_checkpoint_dist; - bool m_checkpoint_shared; - - template - struct header_t { - int epoch; - int step; - int shared; - char dirname[_max_dir_len]; - }; -}; - -static inline std::string get_last_shared_checkpoint_filename(model *m, std::string dir) { - lbann_comm *comm = m->get_comm(); - std::stringstream ss; - ss << dir << "/"; - ss << m->get_name().c_str() << "."; - ss << comm->get_trainer_rank() << ".last.shared.checkpoint"; - return ss.str(); -} - -static inline std::string get_shared_checkpoint_dirname(model *m, std::string dir, int epoch, int step) { - lbann_comm *comm = m->get_comm(); - std::stringstream ss; - ss << dir << "/" << m->get_name().c_str(); - ss << "." << comm->get_trainer_rank(); - ss << ".shared.epoch." 
<< epoch; - ss << ".step."<< step << "/"; - return ss.str(); -} - -static inline std::string get_last_distributed_checkpoint_filename(model *m, std::string dir) { - lbann_comm *comm = m->get_comm(); - std::stringstream ss; - ss << dir << "/"; - ss << m->get_name().c_str() << "."; - ss << comm->get_trainer_rank() << ".last.distributed.checkpoint"; - return ss.str(); -} - -static inline std::string get_distributed_checkpoint_dirname(model *m, std::string dir, int epoch, int step) { - lbann_comm *comm = m->get_comm(); - std::stringstream ss; - ss << dir << "/" << m->get_name().c_str(); - ss << "." << comm->get_trainer_rank(); - ss << ".rank." << comm->get_rank_in_trainer(); - ss << ".epoch." << epoch; - ss << ".step."<< step << "/"; - return ss.str(); -} - -// Print last checkpoint to file, used to determine which checkpoint to load from. -static inline bool write_latest(std::string filename, int epoch, int train) { - // open the file for writing - int fd = openwrite(filename.c_str()); - if (fd != -1) { - char field[256]; - sprintf(field, "epoch=%d step=%d\n", epoch, train); - write_string(fd, filename.c_str(), field, strlen(field)); - // close our file - closewrite(fd, filename.c_str()); - } - return true; -} - -/** \brief Reads the "latest" file and returns the epoch number and - * sample offset for most recent checkpoint - */ -static inline bool read_latest(std::string filename, int *epochLast, int *trainLast) { - // assume we don't have a file, we'll return -1 in that case - *epochLast = -1; - *trainLast = -1; - // open the file for reading - int fd = openread(filename.c_str()); - if (fd != -1) { - // read epoch from file - char field[256]; - read_string(fd, filename.c_str(), field, sizeof(field)); - int ret = sscanf(field, "epoch=%d step=%d\n", epochLast, trainLast); - // close our file - closeread(fd, filename.c_str()); - if(ret != 2) { return false; } - } - return true; -} - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_checksmall.hpp b/include/lbann/callbacks/callback_checksmall.hpp deleted file mode 100644 index 2f66a04d2d9..00000000000 --- a/include/lbann/callbacks/callback_checksmall.hpp +++ /dev/null @@ -1,72 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
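// write_latest/read_latest above round-trip a one-line "epoch=%d step=%d" record;
// the same format can be exercised with portable stdio, standing in for LBANN's
// openwrite/openread file-descriptor helpers purely for illustration.
#include <cstdio>

bool write_latest_stdio(const char* filename, int epoch, int step) {
  std::FILE* f = std::fopen(filename, "w");
  if (f == nullptr) { return false; }
  std::fprintf(f, "epoch=%d step=%d\n", epoch, step);
  return std::fclose(f) == 0;
}

// Mirrors the callback's behaviour: a missing file is not an error and yields -1/-1.
bool read_latest_stdio(const char* filename, int* epoch, int* step) {
  *epoch = -1;
  *step = -1;
  std::FILE* f = std::fopen(filename, "r");
  if (f == nullptr) { return true; }
  const int matched = std::fscanf(f, "epoch=%d step=%d", epoch, step);
  std::fclose(f);
  return matched == 2;
}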
-// -// lbann_callback_checksmall .hpp .cpp - Check matrices for small values -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CHECKSMALL_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECKSMALL_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Check matrices for whether they include any very small values to avoid - * getting denormalized values. Denormalized values can significantly slow - * floating point computations. - * Since we often square values, the check is based on the square root of the - * smallest floating point value. - * This will kill the rank if such values are discovered. - */ -class lbann_callback_checksmall : public lbann_callback { - public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_end; - - lbann_callback_checksmall() : lbann_callback() {} - lbann_callback_checksmall(const lbann_callback_checksmall&) = default; - lbann_callback_checksmall& operator=( - const lbann_callback_checksmall&) = default; - lbann_callback_checksmall* copy() const override { - return new lbann_callback_checksmall(*this); - } - /** Check that activations are good. */ - void on_forward_prop_end(model *m, Layer *l) override; - /** Check that gradients are good. */ - void on_backward_prop_end(model *m) override; - /** Check that weights are good. */ - void on_batch_end(model *m) override; - std::string name() const override { return "checksmall"; } - private: - /** Smallest allowable value. */ - const DataType m_threshold = std::sqrt(std::numeric_limits::min()); - /** Return true if there are no problems with m. */ - bool is_good(const AbsDistMat& m); -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CHECKSMALL_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_confusion_matrix.hpp b/include/lbann/callbacks/callback_confusion_matrix.hpp deleted file mode 100644 index b87dc8b24a0..00000000000 --- a/include/lbann/callbacks/callback_confusion_matrix.hpp +++ /dev/null @@ -1,115 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Compute confusion matrix. 
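// The checksmall threshold is the square root of the smallest normalized value, so
// that squaring a checked entry cannot underflow into denormals; a standalone
// version of that test:
#include <cmath>
#include <limits>
#include <vector>

// Returns true when no nonzero entry is small enough for its square to denormalize.
bool no_tiny_values(const std::vector<float>& values) {
  const float threshold = std::sqrt(std::numeric_limits<float>::min());
  for (const float v : values) {
    if (v != 0.0f && std::fabs(v) < threshold) { return false; }
  }
  return true;
}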
- * Confusion matrices are saved in CSV files of the form - * ".csv". The (i,j)-entry is the proportion of samples - * with prediction i and label j. The prediction and label layers are - * assumed to output one-hot vectors for each mini-batch sample. - */ -class lbann_callback_confusion_matrix : public lbann_callback { -public: - - lbann_callback_confusion_matrix(std::string prediction_layer, - std::string label_layer, - std::string prefix); - lbann_callback_confusion_matrix(const lbann_callback_confusion_matrix&); - lbann_callback_confusion_matrix& operator=(const lbann_callback_confusion_matrix&); - lbann_callback_confusion_matrix* copy() const override { - return new lbann_callback_confusion_matrix(*this); - } - std::string name() const override { return "confusion matrix"; } - - void setup(model *m) override; - - void on_epoch_begin(model *m) override { reset_counts(*m); } - void on_epoch_end(model *m) override { save_confusion_matrix(*m); } - void on_validation_begin(model *m) override { reset_counts(*m); } - void on_validation_end(model *m) override { save_confusion_matrix(*m); } - void on_test_begin(model *m) override { reset_counts(*m); } - void on_test_end(model *m) override { save_confusion_matrix(*m); } - void on_batch_end(model *m) override { update_counts(*m); } - void on_batch_evaluate_end(model *m) override { update_counts(*m); } - -private: - - /** Name of prediction layer. - * This layer is assumed to output one-hot vectors. - */ - std::string m_prediction_layer; - /** Name of label layer. - * This layer is assumed to output one-hot vectors. - */ - std::string m_label_layer; - /** Prefix for output files. */ - std::string m_prefix; - - /** Confusion matrix counts. - * Each vector should be interpreted as a num_classes x num_classes - * matrix in row-major order. The (i,j)-entry is the number of - * samples with prediction i and label j. - */ - std::map> m_counts; - - /** "View" into prediction matrix. - * This is a CPU matrix. If the prediction layer keeps data on GPU, - * then this will be a matrix copy rather than a matrix view. - */ - std::unique_ptr m_predictions_v; - /** "View" into label matrix. - * This is a CPU matrix. If the label layer keeps data on GPU or in - * a different distribution than the prediction layer, then this - * will be a matrix copy rather than a matrix view. - */ - std::unique_ptr m_labels_v; - - /** Get prediction matrix. */ - const AbsDistMat& get_predictions(const model& m) const; - /** Get label matrix. */ - const AbsDistMat& get_labels(const model& m) const; - - /** Reset confusion matrix counts. */ - void reset_counts(const model& m); - /** Update confusion matrix counts. - * Counts are updated with current mini-batch predictions and - * labels. - */ - void update_counts(const model& m); - /** Output confusion matrix to file. */ - void save_confusion_matrix(const model& m); - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_debug.hpp b/include/lbann/callbacks/callback_debug.hpp deleted file mode 100644 index c342c7ad778..00000000000 --- a/include/lbann/callbacks/callback_debug.hpp +++ /dev/null @@ -1,108 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. 
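// A hedged sketch of the row-major counts layout used by the confusion-matrix
// callback: counts[i * num_classes + j] is the number of samples with prediction i
// and label j, and the CSV stores each count as a proportion of the total.
#include <cstddef>
#include <cstdio>
#include <vector>

void update_counts(std::vector<long>& counts, int num_classes,
                   const std::vector<int>& predictions, const std::vector<int>& labels) {
  for (std::size_t s = 0; s < predictions.size(); ++s) {
    counts[predictions[s] * num_classes + labels[s]] += 1;
  }
}

void save_confusion_csv(const char* path, const std::vector<long>& counts, int num_classes) {
  long total = 0;
  for (const long c : counts) { total += c; }
  std::FILE* f = std::fopen(path, "w");
  if (f == nullptr) { return; }
  for (int i = 0; i < num_classes; ++i) {
    for (int j = 0; j < num_classes; ++j) {
      const double p = total > 0
                         ? static_cast<double>(counts[i * num_classes + j]) / total
                         : 0.0;
      std::fprintf(f, "%s%g", j == 0 ? "" : ",", p);
    }
    std::fprintf(f, "\n");
  }
  std::fclose(f);
}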
-// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * @brief Phase specific "printf debugging" - * - * Print verbose status updates to standard error stream. - * This callback is useful for "printf debugging." - * - * Takes a prototext parameter @c phase: train | validate | test | \ - * if \ will print messages for all phases - * - */ -class lbann_callback_debug : public lbann_callback { - public: - - /** @brief Constructor. - * - * If modes is empty, status updates will be printed for all - * execution modes. - */ - lbann_callback_debug(std::set modes, - lbann_summary *summarizer = nullptr) : - lbann_callback(1, summarizer), m_modes(std::move(modes)) {} - lbann_callback_debug(const lbann_callback_debug&) = default; - lbann_callback_debug& operator=(const lbann_callback_debug&) = default; - lbann_callback_debug* copy() const override { return new lbann_callback_debug(*this); } - std::string name() const override { return "debug"; } - - /** @brief Print that a batch is beginning. */ - void on_batch_begin(model *m) override; - /** @brief Print that a batch is ending. */ - void on_batch_end(model *m) override; - /** @brief Print that a layer's forward prop is beginning. */ - void on_batch_evaluate_begin(model *m) override; - /** @brief Print that a layer's forward prop is ending. */ - void on_batch_evaluate_end(model *m) override; - - using lbann_callback::on_forward_prop_begin; - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_begin; - using lbann_callback::on_backward_prop_end; - using lbann_callback::on_evaluate_forward_prop_begin; - using lbann_callback::on_evaluate_forward_prop_end; - using lbann_callback::on_optimize_begin; - using lbann_callback::on_optimize_end; - - /** @brief Print that a layer's forward prop is beginning. */ - void on_forward_prop_begin(model *m, Layer *l) override; - /** @brief Print that a layer's forward prop is ending. */ - void on_forward_prop_end(model *m, Layer *l) override; - /** @brief Print that a layer's backward prop is beginning. */ - void on_backward_prop_begin(model *m, Layer *l) override; - /** @brief Print that a layer's backward prop is ending. */ - void on_backward_prop_end(model *m, Layer *l) override; - /** @brief Print that a layer's backward prop is beginning. */ - void on_evaluate_forward_prop_begin(model *m, Layer *l) override; - /** @brief Print that a layer's backward prop is ending. */ - void on_evaluate_forward_prop_end(model *m, Layer *l) override; - - /** @brief Print that a weights' optimization step is beginning. 
*/ - void on_optimize_begin(model *m, weights *w) override; - /** @brief Print that a weights' optimization step is ending. */ - void on_optimize_end(model *m, weights *w) override; - - private: - - /** @brief Execution modes for which status updates will be printed. - * - * If empty, status updates are printed for all execution modes. - */ - std::set m_modes; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_debug_io.hpp b/include/lbann/callbacks/callback_debug_io.hpp deleted file mode 100644 index ffaff0af567..00000000000 --- a/include/lbann/callbacks/callback_debug_io.hpp +++ /dev/null @@ -1,89 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_debug .hpp .cpp - Callback hooks to debug LBANN -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED - -#include -#include -#include "lbann/callbacks/callback.hpp" -#include "lbann/layers/io/input/input_layer.hpp" - -namespace lbann { - -/** - * Print status updates on where training is. - */ -class lbann_callback_debug_io : public lbann_callback { - public: - using lbann_callback::on_forward_prop_begin; - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_begin; - using lbann_callback::on_backward_prop_end; - using lbann_callback::on_evaluate_forward_prop_begin; - using lbann_callback::on_evaluate_forward_prop_end; - - /** - * Debug a particular phase; use invalid to debug every phase. - */ - lbann_callback_debug_io(execution_mode phase = execution_mode::invalid, - int debug_lvl = 0, - lbann_summary *summarizer = nullptr) : - lbann_callback(1, summarizer), m_debug_phase(phase), m_debug_lvl(debug_lvl) {} - lbann_callback_debug_io(const lbann_callback_debug_io&) = default; - lbann_callback_debug_io& operator=( - const lbann_callback_debug_io&) = default; - lbann_callback_debug_io* copy() const override { return new lbann_callback_debug_io(*this); } - /** Print that a training epoch is being started. */ - void on_epoch_begin(model *m) override; - /** Print that forward prop for a layer is beginning. */ - void on_forward_prop_begin(model *m, Layer *l) override; - - /** Print I/O details at the beginning of validation. */ - void on_validation_begin(model *m) override; - /** Print that an evaluation forward prop is beginning. 
*/ - void on_evaluate_forward_prop_begin(model *m, Layer *l) override; - - /** Print I/O details at the beginning of testing. */ - void on_test_begin(model *m) override; - - /** Common format for printing I/O stats at the start of a mini-batch */ - void print_fp_start(model *m, generic_input_layer *input); - /** Common format for printing I/O stats at the start of a phase */ - void print_phase_start(model *m, execution_mode mode); - - std::string name() const override { return "debug_io"; } - private: - /** The phase to debug. */ - execution_mode m_debug_phase; - int m_debug_lvl; /** Debugging level: 0 - epoch begin, 1 - fwd prop */ -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_error_signals.hpp b/include/lbann/callbacks/callback_dump_error_signals.hpp deleted file mode 100644 index 0c5571d9597..00000000000 --- a/include/lbann/callbacks/callback_dump_error_signals.hpp +++ /dev/null @@ -1,63 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Dump gradients w.r.t. inputs to file. - * After each layer performs a backward prop step, this callback will - * dump the gradients w.r.t. inputs (the "error signals") to a - * human-readable ASCII file. This is slow and produces a lot of output. - */ -class lbann_callback_dump_error_signals : public lbann_callback { - public: - - /** Constructor. - * @param basename The basename for output files. - */ - lbann_callback_dump_error_signals(std::string basename = "") - : lbann_callback(), m_basename(basename) {} - lbann_callback_dump_error_signals* copy() const override { - return new lbann_callback_dump_error_signals(*this); - } - std::string name() const override { return "dump error signals"; } - - /** Write error signals to file after each backward prop step. */ - void on_backward_prop_end(model *m, Layer *l) override; - - private: - /** Basename for output files. 
*/ - std::string m_basename; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_gradients.hpp b/include/lbann/callbacks/callback_dump_gradients.hpp deleted file mode 100644 index b0a6d587446..00000000000 --- a/include/lbann/callbacks/callback_dump_gradients.hpp +++ /dev/null @@ -1,73 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_dump_gradients .hpp .cpp - Callbacks to dump gradients -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED - -#include - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * @brief Dump gradient matrices to files. - * @details This will dump each hidden layer's gradient matrix after - * each minibatch. The matrices are written to files using - * Elemental's simple ASCII format. This is not meant for - * checkpointing, but for exporting gradient matrices for analysis - * that isn't easily done in LBANN. Note this dumps matrices during - * each mini-batch. This will be slow and produce a lot of output. - */ -class lbann_callback_dump_gradients : public lbann_callback { - public: - using lbann_callback::on_backward_prop_end; - - /** - * @param basename The basename for writing files. - * @param batch_interval The frequency at which to dump the gradients - */ - lbann_callback_dump_gradients(std::string basename, int batch_interval = 1) : - lbann_callback(batch_interval), m_basename(std::move(basename)) {} - lbann_callback_dump_gradients( - const lbann_callback_dump_gradients&) = default; - lbann_callback_dump_gradients& operator=( - const lbann_callback_dump_gradients&) = default; - lbann_callback_dump_gradients* copy() const override { - return new lbann_callback_dump_gradients(*this); - } - void on_backward_prop_end(model *m) override; - std::string name() const override { return "dump gradients"; } - private: - /** @brief Basename for writing files. 
*/ - std::string m_basename; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp b/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp deleted file mode 100644 index 8840b1a83c5..00000000000 --- a/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp +++ /dev/null @@ -1,78 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_dump_minibatch_sample_indices .hpp .cpp - Callbacks -// to dump the list of indices per minibatch -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED - -#include - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * @brief Dump sample indices for each minibatch to files. - * @details This will dump the list of indices from the training / - * validation / testing data that was processed Note this dumps - * vectors during each mini-batch. This will be slow and produce a lot - * of output. - */ -class lbann_callback_dump_minibatch_sample_indices : public lbann_callback { - public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_evaluate_forward_prop_end; - - /** - * @param basename The basename for writing files. - * @param batch_interval The frequency at which to dump sample indices - */ - lbann_callback_dump_minibatch_sample_indices(std::string basename, - int batch_interval = 1) : - lbann_callback(batch_interval), m_basename(std::move(basename)) {} - lbann_callback_dump_minibatch_sample_indices( - const lbann_callback_dump_minibatch_sample_indices&) = default; - lbann_callback_dump_minibatch_sample_indices& operator=( - const lbann_callback_dump_minibatch_sample_indices&) = default; - lbann_callback_dump_minibatch_sample_indices* copy() const override { - return new lbann_callback_dump_minibatch_sample_indices(*this); - } - void on_forward_prop_end(model *m, Layer *l) override; - void on_evaluate_forward_prop_end(model *m, Layer *l) override; - - void dump_to_file(model *m, Layer *l, int64_t step); - - std::string name() const override { return "dump minibatch sample indices"; } - private: - /** Basename for writing files. 
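// The dump_error_signals and dump_gradients callbacks above export matrices as plain
// ASCII text; the sketch below approximates that layout (one row per line, entries
// whitespace-separated) without depending on Elemental or LBANN.
#include <cstdio>
#include <string>
#include <vector>

bool dump_matrix_ascii(const std::string& basename, const std::vector<double>& mat,
                       int height, int width) {
  std::FILE* f = std::fopen((basename + ".txt").c_str(), "w");
  if (f == nullptr) { return false; }
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      std::fprintf(f, "%s%.17g", j == 0 ? "" : " ", mat[i * width + j]);
    }
    std::fprintf(f, "\n");
  }
  return std::fclose(f) == 0;
}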
*/ - std::string m_basename; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_outputs.hpp b/include/lbann/callbacks/callback_dump_outputs.hpp deleted file mode 100644 index 0ad260be495..00000000000 --- a/include/lbann/callbacks/callback_dump_outputs.hpp +++ /dev/null @@ -1,113 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -#include -#include - -namespace lbann { - -/** @brief Dump layer output tensors to files. - * - * Saves a file for each output tensor of each selected layer, - * computed at each mini-batch step. Output files have the form - * "--epoch<#>-step<#>--output<#>.". - * This is primarily intended as a debugging tool, although it can be - * used for inference when performance is not critical. - * - * For NumPy file formats (npy and npz), tensor dimensions are - * recorded. For text file formats (CSV and TSV), each line contains - * flattened tensor data corresponding to one mini-batch sample - * (which is the transpose of the column-major matrix representation - * we use internally). - * - * CNPY is required to export to NumPy file formats (npy and npz). - */ -class lbann_callback_dump_outputs : public lbann_callback { -public: - - /** @brief Construct a callback to dump outputs. - * - * @param layer_names Names of layers with output dumps - * (default: dump outputs for all layers). - * @param modes Execution modes with output dumps - * (default: dump outputs for all modes). - * @param batch_interval Frequency of output dumps (default: dump - * outputs at each mini-batch step). - * @param directory Directory for output files (default: current - * working directory). - * @param file_format Output file format. Options are csv, tsv, - * npy, npz (default: csv). 
- */ - lbann_callback_dump_outputs( - std::set layer_names,// = std::set(), - std::set modes, // = std::set(), - El::Int batch_interval = 0, - std::string directory = "", - std::string file_format = ""); - - lbann_callback_dump_outputs* copy() const override { - return new lbann_callback_dump_outputs(*this); - } - std::string name() const override { return "dump outputs"; } - - void on_forward_prop_end(model* m, Layer* l) override { dump_outputs(*m, *l); } - void on_evaluate_forward_prop_end(model* m, Layer* l) override { dump_outputs(*m, *l); } - -private: - - /** @brief Names of layers with output dumps. - * @details If empty, outputs will be dumped for all layers. - */ - std::set m_layer_names; - - /** @brief Execution modes with output dumps. - * @details If empty, outputs will be dumped for all execution modes. - */ - std::set m_modes; - - /** @brief Directory for output files. - * @details Pathname has trailing '/'. - */ - std::string m_directory; - - /** @brief Output file format. */ - std::string m_file_format; - - /** @brief Dump outputs to file. - * @details Returns immediately if an output dump is not needed. - */ - void dump_outputs(const model& m, const Layer& l); - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_weights.hpp b/include/lbann/callbacks/callback_dump_weights.hpp deleted file mode 100644 index 7edb2aacc20..00000000000 --- a/include/lbann/callbacks/callback_dump_weights.hpp +++ /dev/null @@ -1,70 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_dump_weights .hpp .cpp - Callbacks to dump weight matrices -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED - -#include - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Dump weight matrices to files. - * This will dump each hidden layer's weight/bias matrix after each epoch. - * The matrices are written to files using Elemental's simple ASCII format. This - * is not meant for checkpointing, but for exporting weight matrices for - * analysis that isn't easily done in LBANN. - */ -class lbann_callback_dump_weights : public lbann_callback { - public: - /** - * @param basename The basename for writing files. 
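// The dump_outputs callback writes one mini-batch sample per line for text formats,
// i.e. the transpose of the column-major activations matrix; a standalone sketch of
// that flattening, with names chosen only for illustration:
#include <cstdio>
#include <vector>

// colmajor holds one column of length `height` per sample.
void write_outputs_csv(std::FILE* f, const std::vector<float>& colmajor,
                       int height, int num_samples) {
  for (int s = 0; s < num_samples; ++s) {
    for (int i = 0; i < height; ++i) {
      std::fprintf(f, "%s%g", i == 0 ? "" : ",", colmajor[s * height + i]);
    }
    std::fprintf(f, "\n");
  }
}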
- */ - lbann_callback_dump_weights(std::string basename) : - lbann_callback(), m_basename(std::move(basename)) {} - lbann_callback_dump_weights(const lbann_callback_dump_weights&) = default; - lbann_callback_dump_weights& operator=( - const lbann_callback_dump_weights&) = default; - lbann_callback_dump_weights* copy() const override { - return new lbann_callback_dump_weights(*this); - } - void on_train_begin(model *m) override; - void on_epoch_end(model *m) override; - std::string name() const override { return "dump weights"; } - private: - /** Basename for writing files. */ - std::string m_basename; - /// Dump weights from learning layers. - void dump_weights(model *m, std::string s = ""); -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_early_stopping.hpp b/include/lbann/callbacks/callback_early_stopping.hpp deleted file mode 100644 index e02fe4d3601..00000000000 --- a/include/lbann/callbacks/callback_early_stopping.hpp +++ /dev/null @@ -1,67 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_early_stopping .hpp .cpp - Callback hooks for early stopping -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED -#define LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED - -#include -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Stop training after validation error stops improving. - */ -class lbann_callback_early_stopping : public lbann_callback { - public: - /** - * Continue training until score has not improved for patience epochs. - */ - lbann_callback_early_stopping(int64_t patience); - lbann_callback_early_stopping(const lbann_callback_early_stopping&) = default; - lbann_callback_early_stopping& operator=( - const lbann_callback_early_stopping&) = default; - lbann_callback_early_stopping* copy() const override { - return new lbann_callback_early_stopping(*this); - } - /** Update validation score and check for early stopping. */ - void on_validation_end(model *m) override; - std::string name() const override { return "early stopping"; } - private: - /** Number of epochs to wait for improvements. */ - int64_t m_patience; - /** Last recorded score. */ - EvalType m_last_score = std::numeric_limits::max(); - /** Current number of epochs without improvement. 
*/ - int64_t m_wait = 0; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_gpu_memory_usage.hpp b/include/lbann/callbacks/callback_gpu_memory_usage.hpp deleted file mode 100644 index aa890efcc87..00000000000 --- a/include/lbann/callbacks/callback_gpu_memory_usage.hpp +++ /dev/null @@ -1,51 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// callback_gpu_memory_usage .hpp .cpp - Callbacks for printing GPU memory usage -//////////////////////////////////////////////////////////////////////////////// - -#ifndef __LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED -#define __LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { -/** Callback hooks for printing GPU memory usage. */ -class lbann_callback_gpu_memory_usage : public lbann_callback { - public: - - /** Constructor. - */ - lbann_callback_gpu_memory_usage() = default; - lbann_callback_gpu_memory_usage(const lbann_callback_gpu_memory_usage&) = default; - lbann_callback_gpu_memory_usage& operator=(const lbann_callback_gpu_memory_usage&) = default; - lbann_callback_gpu_memory_usage* copy() const override { return new lbann_callback_gpu_memory_usage(*this); } - void on_epoch_begin(model *m) override; - std::string name() const override { return "GPU memory usage"; } -}; - -} // namespace lbann - -#endif // __LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_hang.hpp b/include/lbann/callbacks/callback_hang.hpp deleted file mode 100644 index 2ec4c68b835..00000000000 --- a/include/lbann/callbacks/callback_hang.hpp +++ /dev/null @@ -1,69 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
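// The patience bookkeeping used by the early-stopping callback above (and by the
// adaptive learning-rate schedule later in this diff), in a standalone form that
// assumes lower scores are better:
#include <limits>

class EarlyStopping {
public:
  explicit EarlyStopping(long patience) : m_patience(patience) {}
  // Returns true once the score has not improved for `patience` evaluations.
  bool should_stop(double score) {
    if (score < m_best) {   // improvement: record it and reset the wait counter
      m_best = score;
      m_wait = 0;
      return false;
    }
    return ++m_wait >= m_patience;
  }
private:
  long m_patience;
  long m_wait = 0;
  double m_best = std::numeric_limits<double>::max();
};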
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_hang .hpp .cpp - Callback to hang LBANN for debuggers -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Hang LBANN as training starts so debuggers can attach. - * This will cause either a specific rank (in COMM_WORLD) or every rank to hang. - * Attach to the hung ranks and set the hang flag to false with a debugger to - * proceed. - */ -class lbann_callback_hang : public lbann_callback { - public: - /** - * @param rank_to_hang The rank to hang; -1 for every rank (default). - */ - lbann_callback_hang(int rank_to_hang = -1) : - m_rank_to_hang(rank_to_hang) {} - lbann_callback_hang(const lbann_callback_hang&) = default; - lbann_callback_hang& operator=(const lbann_callback_hang&) = default; - lbann_callback_hang* copy() const override { return new lbann_callback_hang(*this); } - /// Hang on train begin. - void on_train_begin(model* m) override { - if (m_rank_to_hang == -1 || - m_rank_to_hang == m->get_comm()->get_rank_in_world()) { - // Set this flag to false with your debugger to resume execution. - volatile bool lbann_hang = true; - while (lbann_hang) {} - } - } - std::string name() const override { return "hang"; } - protected: - /// The rank that will hang; -1 for every rank. - int m_rank_to_hang; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_imcomm.hpp b/include/lbann/callbacks/callback_imcomm.hpp deleted file mode 100644 index fb52daa2bee..00000000000 --- a/include/lbann/callbacks/callback_imcomm.hpp +++ /dev/null @@ -1,102 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// lbann_callback_imcomm .hpp .cpp - Send gradient updates between models -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED - -#include -#include -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Support inter-model communication after each mini-batch to synchronize - * gradient updates. - */ -class lbann_callback_imcomm : public lbann_callback { - public: - using lbann_callback::on_backward_prop_end; - - enum comm_type { - NONE, /** Do no gradient updates. */ - NORMAL, /** Simply sum gradient updates. */ - }; - - /** - * Initialize with ct being used for all weights. - */ - lbann_callback_imcomm(comm_type ct = NORMAL, - lbann_summary *summarizer = nullptr); - lbann_callback_imcomm(const lbann_callback_imcomm&) = default; - lbann_callback_imcomm& operator=(const lbann_callback_imcomm&) = default; - lbann_callback_imcomm* copy() const override { - return new lbann_callback_imcomm(*this); - } - /** - * Convenience initialization to do one update type for specific weights. - * Implies no inter-model updates for other weights. - */ - lbann_callback_imcomm(comm_type ct, std::unordered_set weights_list, - lbann_summary *summarizer = nullptr); - - /** Choose comm type ct for weights. */ - void set_weights_comm(weights *w, comm_type ct); - - /** Do initialization for this model. */ - void setup(model *m) override; - /** Make sure all models have the same weights. */ - void on_train_begin(model *m) override; - /** Do inter-model gradient updates. */ - void on_backward_prop_end(model *m) override; - - std::string name() const override { return "imcomm"; } - - private: - /** Parameters for a given set of weights. */ - struct imcomm_params { - /** Type of communication done. */ - comm_type ct = NONE; - }; - /** Default communication type. */ - comm_type m_default_ct; - /** Per-weights parameters. */ - std::unordered_map m_weights_params; - - /** Summarize relevant statistics. */ - void do_summary(model *m, weights *w, EvalType im_time); -}; - - -/** returns a string representation of the weight_initialization */ -std::string get_comm_type_name(lbann_callback_imcomm::comm_type m); - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_io.hpp b/include/lbann/callbacks/callback_io.hpp deleted file mode 100644 index 2ed29430a05..00000000000 --- a/include/lbann/callbacks/callback_io.hpp +++ /dev/null @@ -1,60 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
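// The NORMAL comm_type above simply sums gradient updates across models after each
// mini-batch; raw MPI is used below purely for illustration, standing in for LBANN's
// lbann_comm wrapper.
#include <mpi.h>
#include <vector>

void inter_model_gradient_sum(std::vector<float>& gradient, MPI_Comm comm) {
  MPI_Allreduce(MPI_IN_PLACE, gradient.data(), static_cast<int>(gradient.size()),
                MPI_FLOAT, MPI_SUM, comm);
}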
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_io .hpp .cpp - Callback hooks for I/O monitoring -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_IO_HPP_INCLUDED -#define LBANN_CALLBACKS_IO_HPP_INCLUDED - -#include -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Print information on the amount of IO that layers do. - */ -class lbann_callback_io : public lbann_callback { - public: - lbann_callback_io(); - lbann_callback_io(const lbann_callback_io&) = default; - lbann_callback_io& operator=(const lbann_callback_io&) = default; - lbann_callback_io* copy() const override { return new lbann_callback_io(*this); } - /** Only apply to specific layers. */ - lbann_callback_io(std::unordered_set layers); - /** Report how much I/O has occured per data reader */ - void on_epoch_end(model *m) override; - void on_test_end(model *m) override; - std::string name() const override { return "io"; } - private: - /** Indicies of layers to monitor. */ - std::unordered_set m_layer_indices; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_learning_rate.hpp b/include/lbann/callbacks/callback_learning_rate.hpp deleted file mode 100644 index 55dd090a7ea..00000000000 --- a/include/lbann/callbacks/callback_learning_rate.hpp +++ /dev/null @@ -1,297 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_learning_rate .hpp .cpp - Callback hooks for learning rate schedules -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED -#define LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED - -#include -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -// Different schedules should inherit from lbann_callback_learning_rate. - -/** - * Base class for learning rate schedules. - * Child classes should implement the schedule method to make changes. 
- */ -class lbann_callback_learning_rate : public lbann_callback { - public: - lbann_callback_learning_rate(); - lbann_callback_learning_rate(const lbann_callback_learning_rate&) = default; - lbann_callback_learning_rate& operator=( - const lbann_callback_learning_rate&) = default; - /** Only apply to specific weights. */ - lbann_callback_learning_rate(std::unordered_set weights_list); - /** Do some initialization. */ - void setup(model *m) override; - /** Apply global learning rate schedules. */ - void on_epoch_end(model *m) override; - - using lbann_callback::on_backward_prop_end; - /** Apply local/per-optimizer learning rate schedules. */ - void on_backward_prop_end(model *m) override; - protected: - /** - * This is called at the end of every epoch to update the learning - * rate for every optimizer. Adjustments should be made based on the - * current global learning rate. - * The returned learning rate will be used to automatically update - * the current global learning rate. - */ - virtual float global_schedule(model *m) { return m_cur_global_lr; } - /** - * This is called at the end of every training mini-batch to update the - * learning rate for optimizer opt. The current global learning rate is *not* - * updated automatically based on this method. - */ - virtual float optimizer_schedule(model *m, optimizer &opt) { - return opt.get_learning_rate(); - } - - /** Weights to update. */ - std::unordered_set m_weights; - - /** - * This should be maintained by all learning rate schedule - * implementations as the current global learning rate. This enables - * coordination among different schedules, particularly ones that - * work on a per-optimizer basis. - */ - static float m_cur_global_lr; -}; - -/** - * Decrease the learning rate by a fixed proportion every X epochs. - */ -class lbann_callback_step_learning_rate : public lbann_callback_learning_rate { - public: - /** Decrease the learning rate by amt every step epochs. */ - lbann_callback_step_learning_rate(int step, float amt); - lbann_callback_step_learning_rate(int step, float amt, - std::unordered_set weights_list); - lbann_callback_step_learning_rate( - const lbann_callback_step_learning_rate&) = default; - lbann_callback_step_learning_rate& operator=( - const lbann_callback_step_learning_rate&) = default; - lbann_callback_step_learning_rate* copy() const override { - return new lbann_callback_step_learning_rate(*this); - } - std::string name() const override { return "step learning rate"; } - protected: - float global_schedule(model *m) override; - private: - /** Number of epochs between each learning rate decrease. */ - int m_step; - /** Amount to decrease the learning rate by. */ - float m_amt; -}; - -/** - * Decrease the learning rate by a fixed proportion when validation error stops - * improving. - */ -class lbann_callback_adaptive_learning_rate : public lbann_callback_learning_rate { - public: - /** - * Decrease the learning rate by amt if accuracy does not improve for patience - * epochs. 
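 *
 * For example (values shown for illustration only):
 * @code
 * lbann_callback_adaptive_learning_rate cb(3, 0.1f);
 * @endcode
 * would scale the current global learning rate by 0.1 once three epochs
 * pass without an improvement in the monitored score.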
- */ - lbann_callback_adaptive_learning_rate(int64_t patience, float amt); - lbann_callback_adaptive_learning_rate(int64_t patience, float amt, - std::unordered_set weights_list); - lbann_callback_adaptive_learning_rate( - const lbann_callback_adaptive_learning_rate&) = default; - lbann_callback_adaptive_learning_rate& operator=( - const lbann_callback_adaptive_learning_rate&) = default; - lbann_callback_adaptive_learning_rate* copy() const override { - return new lbann_callback_adaptive_learning_rate(*this); - } - std::string name() const override { return "adaptive learning rate"; } - protected: - float global_schedule(model *m) override; - private: - /** Number of epochs to wait for improvements. */ - int64_t m_patience; - /** Amount to decrease the learning rate by. */ - float m_amt; - /** Current epoch. */ - int m_cur_epoch = -1; - /** Last recorded score. */ - EvalType m_last_score = std::numeric_limits::max(); - /** Current number of epochs without improvement. */ - int64_t m_wait = 0; - /** Whether to adjust learning rate for current epoch. */ - bool m_adjust_learning_rate = false; -}; - -/** - * Decrease learning rate by a fixed amount at fixed times. - */ -class lbann_callback_drop_fixed_learning_rate : - public lbann_callback_learning_rate { - public: - /** - * Decrease the learning rate by amt when each epoch in drop_epochs is - * reached. - */ - lbann_callback_drop_fixed_learning_rate( - std::vector drop_epochs, float amt); - lbann_callback_drop_fixed_learning_rate( - std::vector drop_epochs, float amt, - std::unordered_set weights_list); - lbann_callback_drop_fixed_learning_rate( - const lbann_callback_drop_fixed_learning_rate&) = default; - lbann_callback_drop_fixed_learning_rate& operator=( - const lbann_callback_drop_fixed_learning_rate&) = default; - lbann_callback_drop_fixed_learning_rate* copy() const override { - return new lbann_callback_drop_fixed_learning_rate(*this); - } - std::string name() const override { return "drop fixed learning rate"; } - protected: - float global_schedule(model *m) override; - private: - /// Amount to decrease the learning rate by. - float m_amt; - /** - * Epochs to drop learning rate at. This is stored in reverse sorted order, - * so that the end can be examined and then popped in constant time. - */ - std::vector m_drop_epochs; -}; - -/** - * Linearly increase the learning rate to reach a target value over a - * fixed number of epochs. - * @note This currently assumes every optimizer begins with the same - * learning rate. This also *forces* its schedule and will stomp over - * other changes. - */ -class lbann_callback_linear_growth_learning_rate : - public lbann_callback_learning_rate { - public: - /** - * Linearly increase the learning rate to reach target after num_epochs. 
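 *
 * A minimal sketch of the intended ramp (the helper below is illustrative
 * and not part of this header; the callback itself works through
 * global_schedule()):
 * @code
 * float linear_growth(float base, float target,
 *                     int64_t num_epochs, int64_t delay, int64_t epoch) {
 *   if (epoch < delay) { return base; }                  // growth delayed
 *   if (epoch >= delay + num_epochs) { return target; }  // ramp finished
 *   const float inc = (target - base) / num_epochs;      // per-epoch step
 *   return base + inc * static_cast<float>(epoch - delay);
 * }
 * @endcode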
- */ - lbann_callback_linear_growth_learning_rate( - float target, int64_t num_epochs); - lbann_callback_linear_growth_learning_rate( - float target, int64_t num_epochs, int64_t delay); - lbann_callback_linear_growth_learning_rate( - float target, int64_t num_epochs, int64_t delay, - std::unordered_set weights_list); - lbann_callback_linear_growth_learning_rate( - const lbann_callback_linear_growth_learning_rate&) = default; - lbann_callback_linear_growth_learning_rate& operator=( - const lbann_callback_linear_growth_learning_rate&) = default; - lbann_callback_linear_growth_learning_rate* copy() const override { - return new lbann_callback_linear_growth_learning_rate(*this); } - void setup(model *m) override; - std::string name() const override { return "linear growth learning rate"; } - protected: - float global_schedule(model *m) override; - private: - /// Initial learning rate. - float m_base_lr; - /// Target learning rate to reach. - float m_target; - /// Amount to increase each epoch. - float m_inc; - /// Number of epochs over which to scale the learning rate. - int64_t m_num_epochs; - /// Number of epochs to delay before starting growth. - int64_t m_delay; -}; - -/** - * Decrease the learning rate by polynomial policy - * base_lr*(1 - i_cur/i_max)^p, where - * base_lr is the initial learning rate, i_cur is the current iteration, - * i_max is the maximum iteration, and p is a parameter. - */ -class lbann_callback_poly_learning_rate : public lbann_callback_learning_rate { - public: - lbann_callback_poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter); - lbann_callback_poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter, double endl_r, - std::unordered_set weights_list); - lbann_callback_poly_learning_rate( - const lbann_callback_poly_learning_rate&) = default; - lbann_callback_poly_learning_rate& operator=( - const lbann_callback_poly_learning_rate&) = default; - lbann_callback_poly_learning_rate* copy() const override { - return new lbann_callback_poly_learning_rate(*this); - } - void setup(model *m) override; - std::string name() const override { return "poly learning rate"; } - protected: - float global_schedule(model *m) override; - float optimizer_schedule(model *m, optimizer &opt) override; - private: - /// The exponent to compute new learning rate in poly policy - double m_p; - /// The number of epochs for training - uint64_t m_num_epochs; - /// The maximum number of iterations until which the learning rate changes - uint64_t m_max_iter; - /// The minimum learning rate - float m_end_lr; - /// The current rate to scale the base learning rate - float m_lr; - /// The learning rate scale used at the end of the last epoch - float m_last_epoch_lr; -}; - -/** - * This implements an adaptive scheme for adjust each optimizer's - * learning rate based on the ratio of the norms of its weights and - * its gradients. - * See: You et al. "Scaling SGD Batch Size to 32K for ImageNet - * Training", 2017. 
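 *
 * Roughly speaking (this is a paraphrase of the cited scheme, not a
 * statement of the exact implementation), each optimizer's rate is
 * scaled as
 * @f[
 *   \eta_{\text{local}} \approx \eta \cdot \text{scale} \cdot
 *     \frac{\lVert w \rVert}{\lVert \nabla w \rVert},
 * @f]
 * where @f$\eta@f$ is the optimizer's current learning rate, @f$w@f$ its
 * weights, and @f$\text{scale}@f$ the constructor argument.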
- */ -class lbann_callback_optimizerwise_adaptive_learning_rate : public lbann_callback_learning_rate { - public: - lbann_callback_optimizerwise_adaptive_learning_rate(float scale); - lbann_callback_optimizerwise_adaptive_learning_rate( - float scale, std::unordered_set weights_list); - lbann_callback_optimizerwise_adaptive_learning_rate( - const lbann_callback_optimizerwise_adaptive_learning_rate&) = default; - lbann_callback_optimizerwise_adaptive_learning_rate& operator=( - const lbann_callback_optimizerwise_adaptive_learning_rate&) = default; - lbann_callback_optimizerwise_adaptive_learning_rate* copy() const override { - return new lbann_callback_optimizerwise_adaptive_learning_rate(*this); } - std::string name() const override { return "optimizerwise adaptive learning rate"; } - protected: - float optimizer_schedule(model *m, optimizer &opt) override; - private: - float m_scale; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_ltfb.hpp b/include/lbann/callbacks/callback_ltfb.hpp deleted file mode 100644 index e28a717da9c..00000000000 --- a/include/lbann/callbacks/callback_ltfb.hpp +++ /dev/null @@ -1,168 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include -#include -#include - -namespace lbann { - -/** @brief Tournament training. - * - * This is intended to support research into the LTFB algorithm. An - * outline: - * - Divide the computational resources into multiple "trainers" - * that can operate in parallel. - * - Setup a model on each trainer and begin training independently. - * - Periodically launch tournaments to select "good" models. More - * specifically, trainers partner up and exchange their models. - * Each trainer evaluates a metric for its local and partner - * models, using its validation data set. The model with the better - * score is retained and the other one is discarded. - * - * There are many algorithmic variations to be explored: - * - How is data is divvied up amongst the trainers. Is it strictly - * partitioned, partially shared, or completely replicated? - * - What model components are exchanged? Just the trainable weights, - * or a subset of the weights? Hyperparameters? 
- * - Can this be used to explore model architectures? - * - * @todo Exchange optimizer state. - * @todo Support heterogeneous models. - */ -class lbann_callback_ltfb : public lbann_callback { -public: - - /** Inter-trainer communication scheme for LTFB. - * - * The specifics of these algorithms are experimental and will be - * in flux. - */ - enum class communication_algorithm { - /** Directly exchange weights values with sendrecv. - * - * Corresponding ranks in partner trainers will iterate through - * their weights and exchange values with sendrecvs. - * - * Notes: - * - Requires all models to be identical aside from their - * weights values, so this is not suitable for hyperparameter - * or model architecture exploration. - * - Optimizer state is not exchanged, so there may be wonky - * learning behavior immediately after a tournament. - * - Optimal if communication performance between ranks is - * uniform and independent. If intra-trainer communication is - * fast or if communication performance is sensitive to - * network traffic, it may be advantageous to gather model - * data on the trainer master ranks and only perform - * inter-trainer communication between them. - */ - sendrecv_weights, - - /** Save and load model data with checkpoint files. - * - * @todo Implement. - * - * Notes: - * - Supports hyperparameter exploration. - * - Checkpoint files currently do not store model architecture - * information, so this is not suitable for model - * architecture exploraiton. - * - This approach is temporary and experimental, since going - * through the file system is very suboptimal. When a wire - * format for model checkpoints is developed, it should be - * used instead. - */ - checkpoint_file - }; - - /** @brief Construct the LTFB callback - * @param batch_interval Number of training mini-batch steps between - * tournaments. - * @param metric_name Metric for tournament evaluation. - * @param weights_names List of weights to exchange with partner. - * If empty, then all weights are exchanged. - * @param low_score_wins Whether low-scoring or high-scoring models - * survive a tournament. - * @param comm_algo Inter-trainer communication scheme. - * @param summarizer The summarizer to use for this callback - */ - lbann_callback_ltfb( - El::Int batch_interval, - std::string metric_name, - std::set weights_names = std::set(), - bool low_score_wins = false, - communication_algorithm comm_algo = communication_algorithm::sendrecv_weights, - lbann_summary *summarizer = nullptr); - lbann_callback_ltfb(const lbann_callback_ltfb& other); - lbann_callback_ltfb& operator=(const lbann_callback_ltfb& other); - lbann_callback_ltfb* copy() const override { return new lbann_callback_ltfb(*this); } - std::string name() const override { return "LTFB"; } - - void setup(model *m) override; - void on_train_begin(model *m) override; - void on_batch_begin(model *m) override; - - /** Convert string to LTFB communication algorithm. - * - * If an empty string is provided, returns @c - * communication_algorithm::sendrecv_weights. - */ - static communication_algorithm string_to_comm_algo(const std::string& str); - -private: - - /** Metric for tournament evaluation. */ - std::string m_metric_name; - - /** List of weights to exchange with partner. - * - * If empty, then all weights are exchanged. - */ - std::set m_weights_names; - - /** Whether low-scoring or high-scoring models survive a - * tournament. */ - bool m_low_score_wins; - - /** Inter-trainer communication scheme. 
*/ - communication_algorithm m_comm_algo; - - /** Workspace weights. - * - * Used to temporarily store local weights during a tournament. - */ - std::vector> m_workspace_weights; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_perturb_adam.hpp b/include/lbann/callbacks/callback_perturb_adam.hpp deleted file mode 100644 index 6adf47dd83a..00000000000 --- a/include/lbann/callbacks/callback_perturb_adam.hpp +++ /dev/null @@ -1,127 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include "lbann/optimizers/adam.hpp" -#include - -namespace lbann { - -/** @brief Hyperparameter exploration with Adam optimizers. - * - * Goes through the Adam optimizers in a model and perturbs four - * hyperparameters: the learning rate, @f$\beta_1@f$, @f$\beta_2@f$, - * and @f$\epsilon@f$. Since these hyperparameters can range over - * orders of magnitude, the perturbations are performed in log space. - * More precisely, random values are drawn from normal distributions - * (with user-provided standard deviations) and added to - * @f$\log(\text{learning rate})@f$, @f$\log(1-\beta_1)@f$, - * @f$\log(1-\beta_2)@f$, and @f$\log\epsilon@f$. - */ -class lbann_callback_perturb_adam : public lbann_callback { -public: - - /** @param learning_rate_factor Standard deviation of learning rate - * perturbation (in log space). - * @param beta1_factor Standard deviation of @f$\beta_1@f$ - * perturbation (in log space). - * @param beta2_factor Standard deviation of @f$\beta_2@f$ - * perturbation (in log space). - * @param eps_factor Standard deviation of @f$\epsilon@f$ - * perturbation (in log space). - * @param perturb_during_training Whether to periodically perturb - * hyperparameters during training - * or to only perturb once during - * setup. - * @param batch_interval Number of training mini-batch steps between - * perturbations. Only used if - * @c perturb_during_training is @c true. - * @param weights_names Names of weights with Adam optimizers. If - * empty, all Adam optimizers in the model are - * perturbed. 
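 *
 * A minimal sketch of the log-space perturbation described above (the
 * helper below is illustrative and not part of this header):
 * @code
 * #include <cmath>
 * #include <random>
 *
 * // Perturb a learning rate in log space with standard deviation `factor`.
 * double perturb_lr(double lr, double factor, std::mt19937& gen) {
 *   std::normal_distribution<double> noise(0.0, factor);
 *   return std::exp(std::log(lr) + noise(gen));
 * }
 * @endcode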
- */ - lbann_callback_perturb_adam(DataType learning_rate_factor, - DataType beta1_factor, - DataType beta2_factor, - DataType eps_factor = 0, - bool perturb_during_training = false, - El::Int batch_interval = 1, - std::set weights_names - = std::set()); - lbann_callback_perturb_adam* copy() const override { return new lbann_callback_perturb_adam(*this); } - std::string name() const override { return "perturb Adam"; } - - void setup(model* m); - void on_batch_begin(model* m); - -private: - - /** Standard deviation of learning rate perturbation. - * - * In log space. - */ - DataType m_learning_rate_factor; - /** Standard deviation of @f$\beta_1@f$ perturbation. - * - * In log space. - */ - DataType m_beta1_factor; - /** Standard deviation of @f$\beta_2@f$ perturbation. - * - * In log space. - */ - DataType m_beta2_factor; - /** Standard deviation of @f$\epsilon@f$ perturbation. - * - * In log space. - */ - DataType m_eps_factor; - - /** Whether to periodically perturb during training. - * - * If false, only perturb once during setup. - */ - bool m_perturb_during_training; - - /** Optimizers for these weights will be perturbed. - * - * If empty, all Adam optimizers in the model will be perturbed. - */ - std::set m_weights_names; - - /** Perturb Adam optimizers in model. */ - void perturb(model& m) const; - /** Perturb Adam optimizer hyperparameters. */ - void perturb(lbann_comm& comm, adam& m) const; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_print.hpp b/include/lbann/callbacks/callback_print.hpp deleted file mode 100644 index 53c77d2a7a1..00000000000 --- a/include/lbann/callbacks/callback_print.hpp +++ /dev/null @@ -1,63 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_print .hpp .cpp - Callback hooks to print information -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Periodically print computational results. - * Prints average objective function value and metric scores after - * each training epoch and evaluation. 
- */ -class lbann_callback_print : public lbann_callback { - public: - lbann_callback_print(int batch_interval = 1, bool print_global_stat_only=false) : - lbann_callback(batch_interval), m_print_global_stat_only(print_global_stat_only) {} - lbann_callback_print(const lbann_callback_print&) = default; - lbann_callback_print& operator=(const lbann_callback_print&) = default; - lbann_callback_print* copy() const override { return new lbann_callback_print(*this); } - void setup(model *m) override; - void on_epoch_begin(model *m) override; - void on_epoch_end(model *m) override; - void on_validation_end(model *m) override; - void on_test_end(model *m) override; - std::string name() const override { return "print"; } - - private: - /** Print objective function and metrics to standard output. */ - void report_results(model *m); - bool m_print_global_stat_only; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_replace_weights.hpp b/include/lbann/callbacks/callback_replace_weights.hpp deleted file mode 100644 index 62bf033792c..00000000000 --- a/include/lbann/callbacks/callback_replace_weights.hpp +++ /dev/null @@ -1,71 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED - -#include - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Weights/parameters replacement on k-batch end - * Currently support replacing weights/parameters using layer names - * Can easily be extended to support replacement by weights name - * Given two layers specified in prototext, weights are copied from source layer to destination layer. 
- */ -class lbann_callback_replace_weights : public lbann_callback { - public: - lbann_callback_replace_weights(std::vector src, - std::vector dst, int batch_interval=1) : - lbann_callback(batch_interval), - m_src_layers(std::move(src)), - m_dst_layers(std::move(dst)){ - if(m_src_layers.size() != m_dst_layers.size()) - throw lbann_exception("In replace weights callback: number of src and dest layers does not match."); - } - - lbann_callback_replace_weights( - const lbann_callback_replace_weights&) = default; - lbann_callback_replace_weights& operator=( - const lbann_callback_replace_weights&) = default; - lbann_callback_replace_weights* copy() const override { - return new lbann_callback_replace_weights(*this); - } - void on_batch_end(model *m) override; - - std::string name() const override { return "replace weights"; } - private: - std::vector m_src_layers, m_dst_layers; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_save_images.hpp b/include/lbann/callbacks/callback_save_images.hpp deleted file mode 100644 index 72d870f3fc1..00000000000 --- a/include/lbann/callbacks/callback_save_images.hpp +++ /dev/null @@ -1,76 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED - -#include -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Save layer outputs as image files. - * Image files are in the form - * "-.". - */ -class lbann_callback_save_images : public lbann_callback { -public: - - /** Constructor. - * @param layer_names List of layer names to save as images. - * @param image_format Image file format (e.g. jpg, png, pgm). - * @param image_prefix Prefix for image file names. 
- */ - lbann_callback_save_images(std::vector layer_names, - std::string image_format = "jpg", - std::string image_prefix = ""); - lbann_callback_save_images(const lbann_callback_save_images&) = default; - lbann_callback_save_images& operator=( - const lbann_callback_save_images&) = default; - lbann_callback_save_images* copy() const override { - return new lbann_callback_save_images(*this); - } - void on_epoch_end(model *m) override; - void on_test_end(model *m) override; - std::string name() const override { return "save images"; } - -private: - - /** List of layer names to save as images. */ - std::vector m_layer_names; - /** Image file format. - * Valid options: jpg, png, pgm. - */ - std::string m_image_format; - /** Prefix for saved image files. */ - std::string m_image_prefix; - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_save_model.hpp b/include/lbann/callbacks/callback_save_model.hpp deleted file mode 100644 index aeeae47415a..00000000000 --- a/include/lbann/callbacks/callback_save_model.hpp +++ /dev/null @@ -1,80 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_save_model .hpp .cpp - Callbacks to save model, currently as protobuf -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED - -#include - -#include "lbann/callbacks/callback.hpp" -#include -#include - -namespace lbann { - -/** - * Save model to as protobuf file and set of weights - */ -class lbann_callback_save_model : public lbann_callback { - public: - /** - * @param dir directory to save model - * @param disable_save_after_training Don't save after training - * @param extension file extension e.g., model, state ...... 
- */ - lbann_callback_save_model(std::string dir, - bool disable_save_after_training, - std::string extension="prototext") : - lbann_callback(), m_dir(std::move(dir)), - m_disable_save_after_training(disable_save_after_training), - m_extension(std::move(extension)) - {} - lbann_callback_save_model(const lbann_callback_save_model&) = default; - lbann_callback_save_model& operator=( - const lbann_callback_save_model&) = default; - lbann_callback_save_model* copy() const override { - return new lbann_callback_save_model(*this); - } - void on_train_end(model *m) override; - bool save_model(model *m); - bool save_model_weights(model *m); - static bool load_model_weights(std::string ckpt_dir, model *m); - - std::string name() const override { return "save model"; } - private: - std::string m_dir; //directory to save file - bool m_disable_save_after_training; /// Disables the normal behavior of saving when training is complete - std::string m_extension; //file extension - persist p; - void write_proto_binary(const lbann_data::Model& proto, const std::string filename); - void write_proto_text(const lbann_data::Model& proto, const std::string filename); -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_summary.hpp b/include/lbann/callbacks/callback_summary.hpp deleted file mode 100644 index 15294ac240d..00000000000 --- a/include/lbann/callbacks/callback_summary.hpp +++ /dev/null @@ -1,71 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_callback_summary .hpp .cpp - Callback hooks to summarize to Tensorboard -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include "lbann/utils/summary.hpp" - -namespace lbann { - -/** - * Summarize information to Tensorboard using LBANN's summary interface. - */ -class lbann_callback_summary : public lbann_callback { - public: - /** - * @param summarizer The summary object to write to; this callback takes - * ownership of it. - * @param batch_interval The frequency with which to summarize - * @param mat_interval FIXME - * @todo Document mat_interval parameter. 
- */ - lbann_callback_summary(lbann_summary *summarizer, int batch_interval = 1, - int mat_interval = 25); - ~lbann_callback_summary() override; - lbann_callback_summary(const lbann_callback_summary&) = default; - lbann_callback_summary& operator=(const lbann_callback_summary&) = default; - lbann_callback_summary* copy() const override { - return new lbann_callback_summary(*this); - } - void on_train_begin(model *m) override; - void on_batch_end(model *m) override; - void on_epoch_end(model *m) override; - void on_test_end(model *m) override; - std::string name() const override { return "summary"; } - protected: - /** Write out histograms from the model's layers. */ - void save_histograms(model *m); - /** Interval for doing matrix summarization. */ - int m_mat_interval; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_sync_layers.hpp b/include/lbann/callbacks/callback_sync_layers.hpp deleted file mode 100644 index 2c9d4984fa8..00000000000 --- a/include/lbann/callbacks/callback_sync_layers.hpp +++ /dev/null @@ -1,80 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// callback_sync_layers.hpp - Callback to synchronize layers -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** Synchronize layers after forward and backward prop. - * Additionally updates layer timing information to account for this. - * Note that this callback should come before the summarizer callback to report - * time correctly (otherwise it will be shifted by one mini-batch). - */ -class lbann_callback_sync_layers : public lbann_callback { - public: - /** - * @param sync_gpus The GPU stream will be synchronized. - * @param sync_mpi A global barrier will synchronize processes. - * @param only_input The only synchronization will be after the input layer in - * forward prop. 
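 *
 * For example (argument values shown for illustration only):
 * @code
 * // Synchronize only the GPU stream, and only after the input layer.
 * lbann_callback_sync_layers cb(true,   // sync_gpus
 *                               false,  // sync_mpi
 *                               true);  // only_input
 * @endcode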
- */ - lbann_callback_sync_layers(bool sync_gpus = true, bool sync_mpi = true, - bool only_input = false) : - lbann_callback(1), m_sync_gpus(sync_gpus), m_sync_mpi(sync_mpi), - m_only_input(only_input) {} - lbann_callback_sync_layers(const lbann_callback_sync_layers&) = default; - lbann_callback_sync_layers& operator=( - const lbann_callback_sync_layers&) = default; - lbann_callback_sync_layers* copy() const override { - return new lbann_callback_sync_layers(*this); - } - std::string name() const override { return "sync_layers"; } - - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_end; - - void on_forward_prop_end(model *m, Layer *l) override; - void on_backward_prop_end(model *m, Layer *l) override; - - protected: - /** Whether to synchronize GPUs. */ - bool m_sync_gpus; - /** Whether to do a global synchronization. */ - bool m_sync_mpi; - /** Whether to only synchronize after the input layer. */ - bool m_only_input; - - virtual void do_sync(Layer *l); -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_sync_selected.hpp b/include/lbann/callbacks/callback_sync_selected.hpp deleted file mode 100644 index 53cda7e8b3f..00000000000 --- a/include/lbann/callbacks/callback_sync_selected.hpp +++ /dev/null @@ -1,138 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// callback_sync_selected.hpp - Callback to synchronize selected layers -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED - -#include "lbann/callbacks/callback_sync_layers.hpp" -#include -#include - -namespace lbann { - -/** - * Synchronize at the beginning and the end of the propagation operation(s) of - * a selected layer, which can be both/either of the forward prop and/or the - * backward prop of the layer. Additionally updates layer timing information to - * account for the synchronization at the end of propagation(s). - * When nvprof is enabled, cudaProfilerStart() follows the synchronization - * inserted at the beginning of the selected prop step(s), and cudaProfilerEnd() - * comes after the local GPU sychronization and before the global MPI barrier - * inserted at the end of the selected prop step(s). 
- * Note that this callback should come before the summarizer callback - * as the base callback lbann_callback_sync_layers requires. - */ -class lbann_callback_sync_selected : public lbann_callback_sync_layers { - public: - ///type of propagation toch synchronize - enum prop_t {Both = 0, Forward = 1, Backward = 2}; - static const std::map m_prop_str; - - using layers_t = std::unordered_map; - using layer_ptrs_t = std::unordered_set; - - /** - * @param layers specifies the layers to synchronize - * @param async_gpus sets not to synchronize gpus. The default is false. - * @param async_mpi sets not to synchronize mpi. The default is false. - */ - lbann_callback_sync_selected(const layers_t& layers, - bool async_gpus = false, bool async_mpi = false); - - lbann_callback_sync_selected(const lbann_callback_sync_selected&) = default; - - lbann_callback_sync_selected& operator=( - const lbann_callback_sync_selected&) = default; - - lbann_callback_sync_selected* copy() const override { - return new lbann_callback_sync_selected(*this); - } - - ~lbann_callback_sync_selected() override; - - std::string name() const override { return "sync_selected"; } - std::string get_description() const; - - /// To protect in case that cudaProfilerInitialized() has already been called - static void turn_off_init_cuda_profiler(); - - /// Tells if cuda_profiler has been initialized - static bool check_if_cuda_profiler_initialized(); - - void init_cuda_profiler(const std::string cfg_file, const std::string out_dir, - int out_mode, lbann_comm* comm) const; - - /** Called once to set up the callback (after all layers are set up). - * Then, populate the layer pointers */ - void setup(model *m) override; - - using lbann_callback::on_forward_prop_begin; - using lbann_callback::on_backward_prop_begin; - using lbann_callback_sync_layers::on_forward_prop_end; - using lbann_callback_sync_layers::on_backward_prop_end; - - /// Synchronize at the beginning of the forward prop of layer l - void on_forward_prop_begin(model* m, Layer* l) override; - /// Synchronize at the end of the forward prop of layer l - void on_forward_prop_end(model* m, Layer* l) override; - /// Synchronize at the beginning of the backward prop of layer l - void on_backward_prop_begin(model* m, Layer* l) override; - /// Synchronize at the end of the backward prop of layer l - void on_backward_prop_end(model* m, Layer* l) override; - - protected: - bool check_if_all_accounted_for() const; - - layer_ptrs_t::iterator populate_layer_ptrs(Layer* l, const prop_t current_prop); - - /// Synchronize and enable cuda profiler - void do_pre_sync(Layer* l); - /// Synchronize and disble cuda profiler - void do_sync(Layer* l) override; - - /// The layers to synchronize. - layers_t m_layers; - - /** The pointers of layers to synchronize for forward prop. - * This set includes those of layers to synchronize for both props. */ - layer_ptrs_t m_fwd_ptrs; - /** The pointers of layers to synchronize for backward prop. - * This set includes those of layers to synchronize for both props. */ - layer_ptrs_t m_bwd_ptrs; - /// The pointers of layers to synchronize for both props. - layer_ptrs_t m_both_ptrs; - - bool m_all_set; ///< whether all the layer pointers are collected - - /// Tells if cudaProfilerInitialized() has already been called. 
- static bool m_cuda_profiler_initialized; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_timeline.hpp b/include/lbann/callbacks/callback_timeline.hpp deleted file mode 100644 index 8bf84dd787d..00000000000 --- a/include/lbann/callbacks/callback_timeline.hpp +++ /dev/null @@ -1,92 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// callback_timeline .hpp .cpp - Callback hooks to record a timeline of runtime -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED - -#include -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Record a timeline of training runtime on each rank and output it to a - * logfile for external processing. - * The logfile is named timeline.m\.\.txt. - * Each line is a separate event, written as name:start-time:end-time. - * Times are relative to the beginning of training. - */ -class lbann_callback_timeline : public lbann_callback { - public: - lbann_callback_timeline(std::string outdir) : lbann_callback(1), - m_outdir(outdir) {} - lbann_callback_timeline(const lbann_callback_timeline&) = default; - lbann_callback_timeline& operator=(const lbann_callback_timeline&) = default; - lbann_callback_timeline* copy() const override { - return new lbann_callback_timeline(*this); - } - std::string name() const override { return "timeline"; } - void on_train_begin(model *m) override; - void on_train_end(model *m) override; - - using lbann_callback::on_forward_prop_begin; - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_begin; - using lbann_callback::on_backward_prop_end; - using lbann_callback::on_optimize_begin; - using lbann_callback::on_optimize_end; - - void on_forward_prop_begin(model *m, Layer *l) override; - void on_forward_prop_end(model *m, Layer *l) override; - void on_backward_prop_begin(model *m, Layer *l) override; - void on_backward_prop_end(model *m, Layer *l) override; - void on_optimize_begin(model *m, weights *w) override; - void on_optimize_end(model *m, weights *w) override; - private: - /// Get time relative to the start time. - EvalType get_rel_time() const { return get_time() - m_start_time; } - - /// Directory to write output to. - std::string m_outdir; - /// Time training started; all times are relative to this. 
- EvalType m_start_time = EvalType(0); - /// Time the current layer's forward pass started. - EvalType m_fp_start_time = EvalType(0); - /// Time the current layer's backward pass started. - EvalType m_bp_start_time = EvalType(0); - /// Time the current weights' optimization pass started. - EvalType m_opt_start_time = EvalType(0); - /// Store (relative) timing information. - std::unordered_map>> m_fp_times; - std::unordered_map>> m_bp_times; - std::unordered_map>> m_opt_times; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_timer.hpp b/include/lbann/callbacks/callback_timer.hpp deleted file mode 100644 index a53243e7a3f..00000000000 --- a/include/lbann/callbacks/callback_timer.hpp +++ /dev/null @@ -1,103 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" -#include -#include -#include - -namespace lbann { - -/** Record and report model timing results. - * Reports the total time and mini-batch time statistics for training - * epochs and for model evaluations. This reports times for the - * master process in each model. - */ -class lbann_callback_timer : public lbann_callback { -public: - - lbann_callback_timer(lbann_summary *summarizer = nullptr) - : lbann_callback(1, summarizer) {} - lbann_callback_timer(const lbann_callback_timer&) = default; - lbann_callback_timer& operator=(const lbann_callback_timer&) = default; - lbann_callback_timer* copy() const override { - return new lbann_callback_timer(*this); - } - - /** Start timing for a training epoch. */ - void on_epoch_begin(model *m) override { timing_begin(*m); } - /** Report timing for a training epoch. */ - void on_epoch_end(model *m) override { timing_end(*m); } - /** Start timing for validation. */ - void on_validation_begin(model *m) override { timing_begin(*m); } - /** Report timing for validation. */ - void on_validation_end(model *m) override { timing_end(*m); } - /** Start timing for testing. */ - void on_test_begin(model *m) override { timing_begin(*m); } - /** Report timing for testing. */ - void on_test_end(model *m) override { timing_end(*m); } - /** Record training mini-batch start time. 
*/ - void on_batch_begin(model *m) override { batch_timing_begin(*m); } - /** Record training mini-batch run time. */ - void on_batch_end(model *m) override { batch_timing_end(*m); } - /** Record evaluation mini-batch start time. */ - void on_batch_evaluate_begin(model *m) override { batch_timing_begin(*m); } - /** Record evaluation mini-batch run time. */ - void on_batch_evaluate_end(model *m) override { batch_timing_end(*m); } - - /** Callback name. */ - std::string name() const override { return "timer"; } - -private: - - /** Timing session start times. */ - std::map m_start_times; - /** Mini-batch timing session start times. */ - std::map m_batch_start_times; - /** Mini-batch times. */ - std::map> m_batch_times; - - /** Start timing session. */ - void timing_begin(const model& m); - /** End timing session. - * Prints results to standard output. - */ - void timing_end(model& m); - /** Start mini-batch timing session. */ - void batch_timing_begin(const model& m); - /** End mini-batch timing session. - * Prints results to standard output. - */ - void batch_timing_end(const model& m); - -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_variable_minibatch.hpp b/include/lbann/callbacks/callback_variable_minibatch.hpp deleted file mode 100644 index 44d8c62f766..00000000000 --- a/include/lbann/callbacks/callback_variable_minibatch.hpp +++ /dev/null @@ -1,145 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_variable_minibatch .hpp .cpp - Callback for variable-size mini-batches -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED -#define LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED - -#include "lbann/callbacks/callback.hpp" - -namespace lbann { - -/** - * Support changing the mini-batch size on different schedules. - * Implementations should override implement the abstract methods to define - * concrete schedules. - */ -class lbann_callback_variable_minibatch : public lbann_callback { - public: - lbann_callback_variable_minibatch(int starting_mbsize); - lbann_callback_variable_minibatch( - const lbann_callback_variable_minibatch&) = default; - lbann_callback_variable_minibatch& operator=( - const lbann_callback_variable_minibatch&) = default; - /// Set the initial mini-batch size. - void on_train_begin(model *m) override; - /// Potentially change the mini-batch size. 
- void on_epoch_end(model *m) override; - protected: - /** - * Implemented by child classes to provide the mini-batch/learning schedule. - * This is called at the end of every training epoch. If it returns false, - * no changes are made from the currently established schedule. - * If this returns true, the mini-batch size will be changed accordingly. - * If the mini-batch size is larger than the model's maximum mini-batch size, - * a warning is printed and the maximum mini-batch size is used. - * If new_lr also non-zero, the learning rate will be changed to new_lr, - * with a linear ramp time. (If ramp_time is 0, it is changed immediately.) - * Note changing the learning rate while in a ramp may lead to unexpected - * behavior; also be aware of interactions with other learning rate - * schedules. - */ - virtual bool schedule(model *m, int& new_mbsize, float& new_lr, - int& ramp_time) = 0; - /// Change the learning rate of every layer in m to new_lr. - void change_learning_rate(model *m, float new_lr) const; - /// Get the current learning rate (assumes every layer has the same one). - float get_current_learning_rate(model *m) const; - /// Initial mini-batch size. - const int m_starting_mbsize; - /** - * The current mini-batch size for this epoch. - * This is kept separately from the model's get_current_mini_batch_size() - * method, as calling that in on_epoch_end returns the size of the last mini- - * batch, not the "base" mini-batch. - */ - int m_current_mini_batch_size; - /// Current number of epochs left to ramp the learning rate. - int m_ramp_count = 0; - /// Amount to increment the learning rate by when ramping. - float m_lr_incr = 0.0f; -}; - -/** - * Double the mini-batch size every set number of epochs. - * Also doubles the learning rate. - */ -class lbann_callback_step_minibatch : public lbann_callback_variable_minibatch { - public: - lbann_callback_step_minibatch(int starting_mbsize, int step, - int ramp_time = 0); - lbann_callback_step_minibatch(const lbann_callback_step_minibatch&) = default; - lbann_callback_step_minibatch& operator=( - const lbann_callback_step_minibatch&) = default; - lbann_callback_step_minibatch* copy() const override { - return new lbann_callback_step_minibatch(*this); - } - std::string name() const override { return "step minibatch"; } - protected: - bool schedule(model *m, int& new_mbsize, float& new_lr, int& ramp_time) override; - /// Number of epochs between mini-batch size increases. - int m_step; - /// Number of steps to ramp the learning rate over. - int m_ramp_time; -}; - -class lbann_callback_minibatch_schedule : public lbann_callback_variable_minibatch { - public: - /// Represents a step in a schedule of mini-batch sizes. - struct minibatch_step { - /// Epoch for this schedule to start. - int epoch; - /// Mini-batch size to use. - int mbsize; - /// Learning rate to use. - float lr; - /// Number of epochs to ramp the learning rate over. 
- int ramp_time; - minibatch_step(int _epoch, int _mbsize, float _lr, int _ramp_time) : - epoch(_epoch), mbsize(_mbsize), lr(_lr), ramp_time(_ramp_time) {} - }; - - lbann_callback_minibatch_schedule( - int starting_mbsize, std::vector steps); - lbann_callback_minibatch_schedule( - const lbann_callback_minibatch_schedule&) = default; - lbann_callback_minibatch_schedule& operator=( - const lbann_callback_minibatch_schedule&) = default; - lbann_callback_minibatch_schedule* copy() const override { - return new lbann_callback_minibatch_schedule(*this); - } - std::string name() const override { return "minibatch schedule"; } - protected: - bool schedule(model *m, int& new_mbsize, float& new_lr, int& ramp_time) override; - - /// Steps in the mini-batch schedule, stored in reverse sorted order. - std::vector m_steps; -}; - -} // namespace lbann - -#endif // LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_dataset.hpp b/include/lbann/callbacks/check_dataset.hpp new file mode 100644 index 00000000000..74030a6fce9 --- /dev/null +++ b/include/lbann/callbacks/check_dataset.hpp @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED + +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Save the sample indices for each mini-batch to ordered set. + * Check to make sure that all samples were properly processed. + */ +class check_dataset : public callback_base { + public: + using callback_base::on_forward_prop_end; + using callback_base::on_evaluate_forward_prop_end; + + check_dataset() : + callback_base() {} + check_dataset( + const check_dataset&) = default; + check_dataset& operator=( + const check_dataset&) = default; + check_dataset* copy() const override { + return new check_dataset(*this); + } + void on_forward_prop_end(model *m, Layer *l) override; + void on_evaluate_forward_prop_end(model *m, Layer *l) override; + void on_epoch_end(model *m) override; + void on_validation_end(model *m) override; + void on_test_end(model *m) override; + + void add_to_set(model *m, Layer *l, int64_t step, std::set &set); + + std::string name() const override { return "check data set indices"; } + private: + /** @brief Basename for writing files. 
*/ + std::string m_basename; + + std::set training_set; + std::set validation_set; + std::set testing_set; +}; + +// Builder function +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( + check_dataset, build_check_dataset_callback_from_pbuf); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_gradients.hpp b/include/lbann/callbacks/check_gradients.hpp new file mode 100644 index 00000000000..39ca536c084 --- /dev/null +++ b/include/lbann/callbacks/check_gradients.hpp @@ -0,0 +1,98 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +#include + +namespace lbann { +namespace callback { + +/** @brief Gradient checking callback. + * + * Gradient checking is performed at the end of each execution mode + * phase. Using a fourth-order finite difference scheme, a numerical + * partial derivative is computed for every weight parameter. If the + * numerical derivative differs signifcantly from the analytical + * derivative computed during backprop, the gradient check has + * failed. + */ +class check_gradients : public callback_base { +public: + + /** + * @param modes Execution modes with gradient checks. If + * none are provided, gradient checking is + * performed for every execution mode. + * @param step_size Step size for numerical + * differentiation (with a step size of + * zero, the step size is estimated to + * minimize the numerical error). + * @param verbose Whether to print results for each + * parameter. + * @param error_on_failure Whether to throw an exception for + * large gradient errors. + */ + check_gradients(std::set modes = {}, + DataType step_size = DataType(0), + bool verbose = false, + bool error_on_failure = false); + check_gradients* copy() const override { + return new check_gradients(*this); + } + std::string name() const override { return "check gradients"; } + void on_train_end(model *m) override { do_check_gradients(*m); } + void on_validation_end(model *m) override { do_check_gradients(*m); } + void on_test_end(model *m) override { do_check_gradients(*m); } + +private: + + /** Execution modes with gradient checks. */ + std::set m_modes; + /** Step size for numerical differentiation. 
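The check_dataset callback introduced above accumulates the sample indices seen in each mini-batch into per-mode ordered sets and then verifies that nothing was dropped or duplicated. A minimal sketch of that bookkeeping, with hypothetical names and a plain std::set standing in for the callback's members:

    #include <cstdint>
    #include <set>
    #include <stdexcept>

    // Illustrative only -- not LBANN code. Collects the indices processed in
    // each mini-batch and checks full, duplicate-free coverage at epoch end.
    struct dataset_coverage {
      std::set<int64_t> seen;

      void record_minibatch(const int64_t* indices, int64_t count) {
        for (int64_t i = 0; i < count; ++i) {
          if (!seen.insert(indices[i]).second) {
            throw std::runtime_error("sample index seen twice in one epoch");
          }
        }
      }

      void check_epoch(int64_t num_samples) {
        if (static_cast<int64_t>(seen.size()) != num_samples) {
          throw std::runtime_error("not every sample was processed this epoch");
        }
        seen.clear();  // start the next epoch fresh
      }
    };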
*/ + EvalType m_step_size; + /** Whether to print results for each parameter. */ + bool m_verbose; + /** Whether to throw an exception for large gradient errors. */ + bool m_error_on_failure; + + /** Does nothing if current execution mode is not in m_modes. */ + void do_check_gradients(model& m) const; + +}; + +// Builder function +std::unique_ptr +build_check_gradients_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_init.hpp b/include/lbann/callbacks/check_init.hpp new file mode 100644 index 00000000000..0f6ffa5c7a5 --- /dev/null +++ b/include/lbann/callbacks/check_init.hpp @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// check_init .hpp .cpp - Check multi-model init +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Verify that every model uses the same initialization. + */ +class check_init : public callback_base { + public: + check_init() = default; + check_init(const check_init&) = default; + check_init& operator=(const check_init&) = default; + check_init* copy() const override { + return new check_init(*this); + } + /** Check initializations. */ + void on_train_begin(model *m) override; + std::string name() const override { return "check init"; } +}; + +// Builder function +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( + check_init, build_check_init_callback_from_pbuf); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_metric.hpp b/include/lbann/callbacks/check_metric.hpp new file mode 100644 index 00000000000..d965f6d6ad5 --- /dev/null +++ b/include/lbann/callbacks/check_metric.hpp @@ -0,0 +1,87 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
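The gradient check described above compares backprop's analytical gradients against numerical derivatives from a fourth-order finite difference. A sketch of that scheme for a single scalar parameter; the function and the step-size heuristic are illustrative, not LBANN's implementation:

    #include <algorithm>
    #include <cmath>
    #include <functional>
    #include <limits>

    // Illustrative only -- not LBANN code. Fourth-order central difference
    // approximation of d(objective)/dw for one scalar weight w.
    double numerical_gradient(const std::function<double(double)>& objective,
                              double w, double step_size = 0.0) {
      // With a step size of zero, pick one that roughly balances truncation
      // error (O(h^4)) against round-off error (O(eps/h)).
      const double eps = std::numeric_limits<double>::epsilon();
      const double h = step_size > 0.0
        ? step_size
        : std::pow(eps, 0.2) * std::max(1.0, std::fabs(w));
      return (-objective(w + 2*h) + 8*objective(w + h)
              - 8*objective(w - h) + objective(w - 2*h)) / (12*h);
    }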
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include + +namespace lbann { +namespace callback { + +/** Metric checking callback. + * Checks if a metric value falls within an expected range. + */ +class check_metric : public callback_base { +public: + + check_metric(std::string metric_name, + std::set modes, + EvalType lower_bound, + EvalType upper_bound, + bool error_on_failure); + check_metric* copy() const override { + return new check_metric(*this); + } + std::string name() const override { return "check metric"; } + + void on_epoch_end(model* m) override { do_check_metric(*m); } + void on_validation_end(model* m) override { do_check_metric(*m); } + void on_test_end(model* m) override { do_check_metric(*m); } + +private: + + /** Metric name. */ + std::string m_metric_name; + + /** Execution modes with metric checks. */ + std::set m_modes; + + /** Lower bound for metric value. */ + EvalType m_lower_bound; + /** Upper bound for metric value. */ + EvalType m_upper_bound; + + /** Whether to throw an exception if metric check fails. */ + bool m_error_on_failure; + + /** Perform metric check. + * Does nothing if current execution mode is not in m_modes; + */ + void do_check_metric(const model& m) const; + +}; + +// Builder function +std::unique_ptr +build_check_metric_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_nan.hpp b/include/lbann/callbacks/check_nan.hpp new file mode 100644 index 00000000000..0894b25a12e --- /dev/null +++ b/include/lbann/callbacks/check_nan.hpp @@ -0,0 +1,72 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// check_nan .hpp .cpp - Check matrices for invalid numbers +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_NAN_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_NAN_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Check matrices for whether they include any NaNs or infs to help debugging. + * This will kill the rank if such values are discovered. + */ +class check_nan : public callback_base { + public: + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_end; + + check_nan() = default; + check_nan(const check_nan&) = default; + check_nan& operator=( + const check_nan&) = default; + check_nan* copy() const override { + return new check_nan(*this); + } + /** Check that activations are good. */ + void on_forward_prop_end(model *m, Layer *l) override; + /** Check that error signals are good. */ + void on_backward_prop_end(model *m, Layer *l) override; + /** Check that gradients are good. */ + void on_backward_prop_end(model *m) override; + /** Check that weights are good. */ + void on_batch_end(model *m) override; + std::string name() const override { return "check_nan"; } + +}; + +// Builder function +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( + check_nan, build_check_nan_callback_from_pbuf) + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_NAN_HPP_INCLUDED diff --git a/include/lbann/callbacks/check_small.hpp b/include/lbann/callbacks/check_small.hpp new file mode 100644 index 00000000000..c5419f58571 --- /dev/null +++ b/include/lbann/callbacks/check_small.hpp @@ -0,0 +1,72 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
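check_nan walks activation, error-signal, gradient, and weight matrices and kills the rank as soon as a NaN or inf shows up. The core test is just a finiteness scan over a buffer; a minimal sketch (the function name and the exception standing in for LBANN's abort are illustrative):

    #include <cmath>
    #include <cstddef>
    #include <stdexcept>
    #include <string>

    // Illustrative only -- not LBANN code. Throwing here stands in for the
    // callback's behavior of killing the rank when a bad value is found.
    void assert_all_finite(const float* buf, std::size_t n, const std::string& what) {
      for (std::size_t i = 0; i < n; ++i) {
        if (!std::isfinite(buf[i])) {
          throw std::runtime_error("non-finite value in " + what
                                   + " at index " + std::to_string(i));
        }
      }
    }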
+// +// check_small .hpp .cpp - Check matrices for small values +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_SMALL_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_SMALL_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Check matrices for whether they include any very small values to avoid + * getting denormalized values. Denormalized values can significantly slow + * floating point computations. + * Since we often square values, the check is based on the square root of the + * smallest floating point value. + * This will kill the rank if such values are discovered. + */ +class check_small : public callback_base { + public: + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_end; + + check_small() = default; + check_small(const check_small&) = default; + check_small& operator=(const check_small&) = default; + check_small* copy() const override { + return new check_small(*this); + } + /** Check that activations are good. */ + void on_forward_prop_end(model *m, Layer *l) override; + /** Check that gradients are good. */ + void on_backward_prop_end(model *m) override; + /** Check that weights are good. */ + void on_batch_end(model *m) override; + std::string name() const override { return "check_small"; } +}; + +// Builder function +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( + check_small, build_check_small_callback_from_pbuf) + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_SMALL_HPP_INCLUDED diff --git a/include/lbann/callbacks/checkpoint.hpp b/include/lbann/callbacks/checkpoint.hpp new file mode 100644 index 00000000000..8a5aaac8003 --- /dev/null +++ b/include/lbann/callbacks/checkpoint.hpp @@ -0,0 +1,314 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
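check_small uses the square root of the smallest normalized floating-point value as its threshold, since squaring anything below that underflows into the denormal range. A one-function sketch of that test (illustrative, not the callback's code):

    #include <cmath>
    #include <limits>

    // Illustrative only -- not LBANN code. Values below sqrt(FLT_MIN) risk
    // producing denormals when squared, which is what the callback guards against.
    bool is_dangerously_small(float x) {
      static const float threshold = std::sqrt(std::numeric_limits<float>::min());
      return x != 0.0f && std::fabs(x) < threshold;
    }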
+// +// checkpoint .hpp .cpp - Callback hooks to checkpoint model +//////////////////////////////////////////////////////////////////////////////// +#ifndef LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include "lbann/io/persist.hpp" +#include "lbann/training_algorithms/training_algorithm.hpp" + +namespace lbann { +namespace callback { + +enum class callback_phase { + batch, + epoch, + validation, + inference, + invalid +}; + +/** @brief Checkpoint at given interval in given directory */ +class checkpoint : public callback_base { + public: + + /** @brief Construct the checkpoint callback + * + * It may be beneficial to the distributed checkpoints at a higher + * tempo than the shared checkpoints because they are less + * expensive. + * + * @param checkpoint_dir directory to save checkpoint files + * @param restart_dir directory to find checkpoint files + * @param checkpoint_epochs interval to checkpoint + * @param checkpoint_steps interval to checkpoint + * @param checkpoint_secs interval to checkpoint + * @param per_rank_dir The directory into which to dump distributed checkpoints + * @param ckpt_dist_epochs The frequency of distributed checkpoints in epochs + * @param ckpt_dist_steps The frequence of distributed checkpoints in steps + */ + checkpoint(std::string checkpoint_dir, + std::string restart_dir, + int checkpoint_epochs, + int checkpoint_steps, + int checkpoint_secs, + std::string per_rank_dir, + int ckpt_dist_epochs, + int ckpt_dist_steps) : + callback_base(), + m_active_trainer(nullptr), + m_active_training_algorithm(nullptr), + m_checkpoint_dir(std::move(checkpoint_dir)), + m_restart_dir(std::move(restart_dir)), + m_checkpoint_epochs(checkpoint_epochs), + m_checkpoint_steps(checkpoint_steps), + m_checkpoint_secs(checkpoint_secs), + m_per_rank_dir(per_rank_dir), + m_ckpt_dist_epochs(ckpt_dist_epochs), + m_ckpt_dist_steps(ckpt_dist_steps) {} + checkpoint(const checkpoint&) = default; + checkpoint& operator=(const checkpoint&) = default; + checkpoint* copy() const override { return new checkpoint(*this); } + void setup(model *m) override; + void setup(trainer *t) override; + void on_train_begin(model *m) override; + void on_epoch_end(model *m) override; + void on_batch_end(model *m) override; + void on_validation_end(model *m) override; + + inline void set_checkpoint_dir(const std::string& dir){ + m_checkpoint_dir = dir; + } + + inline const std::string& get_checkpoint_dir(){ + return m_checkpoint_dir; + } + + inline void set_restart_dir(const std::string& dir){ + m_restart_dir = dir; + } + + inline const std::string& get_restart_dir(){ + // If the restart directory has been explicitly defined use that + if(m_restart_dir.length() != 0) { + return m_restart_dir; + }else { + return m_checkpoint_dir; + } + } + + inline void set_active_trainer(trainer* t){ + m_active_trainer = t; + } + + inline trainer& get_active_trainer(){ + if(m_active_trainer == nullptr) { + LBANN_ERROR("No active trainer for the checkpoint callback"); + } + return *m_active_trainer; + } + + inline void set_active_training_algorithm(training_algorithm* t){ + m_active_training_algorithm = t; + } + + inline training_algorithm& get_active_training_algorithm(){ + if(m_active_training_algorithm == nullptr) { + LBANN_ERROR("No active training algorithm for the checkpoint callback"); + } + return *m_active_training_algorithm; + } + + inline void set_checkpoint_epochs(int epochs){ + m_checkpoint_epochs= epochs; + } + 
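The checkpoint constructor takes separate schedules for shared and distributed (per-rank) checkpoints, and get_restart_dir falls back to the checkpoint directory when no restart directory is given. A hedged construction example; the directory names are placeholders, and attaching the callback to a trainer or model is outside this header:

    #include "lbann/callbacks/checkpoint.hpp"

    // Illustrative only. Shared checkpoints every 10 epochs, cheaper
    // distributed checkpoints every 2 epochs, no step- or time-based triggers.
    lbann::callback::checkpoint* make_checkpoint_callback() {
      return new lbann::callback::checkpoint(
          /*checkpoint_dir=*/"ckpt",
          /*restart_dir=*/"",        // empty: restart from checkpoint_dir
          /*checkpoint_epochs=*/10,
          /*checkpoint_steps=*/0,
          /*checkpoint_secs=*/0,
          /*per_rank_dir=*/"/l/ssd", // placeholder node-local path
          /*ckpt_dist_epochs=*/2,
          /*ckpt_dist_steps=*/0);
    }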
+ inline void set_checkpoint_steps(int steps){ + m_checkpoint_steps= steps; + } + + inline void set_checkpoint_secs(EvalType secs){ + m_checkpoint_secs= secs; + } + + inline void set_per_rank_dir(std::string dir){ + m_per_rank_dir = dir; + } + + inline const std::string& get_per_rank_dir(){ + return m_per_rank_dir; + } + + inline void set_ckpt_dist_epochs(int ckpt_dist_epochs){ + m_ckpt_dist_epochs = ckpt_dist_epochs; + } + + inline void set_ckpt_dist_steps(int ckpt_dist_steps){ + m_ckpt_dist_steps = ckpt_dist_steps; + } + + inline std::string get_shared_checkpoint_rootdir() { + return get_restart_dir(); + } + + /// @todo BVE FIMME this looks wrong I think that the order + /// should be reversed + inline std::string get_distributed_checkpoint_rootdir() { + if(m_per_rank_dir.length()) { + return get_per_rank_dir() + "/" + get_restart_dir(); + }else { + return get_restart_dir(); + } + } + + bool need_checkpoint(model *m, callback_phase phase); + std::string find_latest_checkpoint(lbann_comm& comm, + const std::string& trainer_name, + const std::string& alg_name, + execution_mode& mode, + size_t &epoch, + size_t& step, + bool& shared); + bool open_latest_checkpoint(lbann_comm& comm, + const std::string& task_label, + const std::string& trainer_name, + const std::string& alg_name, + std::function reload_shared_ckpt, + std::function reload_distributed_ckpt); + bool reload_model(model *m); + bool reload_trainer(trainer *t); + bool restart(model *m); + std::string name() const override { return "checkpoint"; } + protected: + bool do_checkpoint(model *m); + private: + trainer* m_active_trainer; + training_algorithm* m_active_training_algorithm; + std::string m_checkpoint_dir; + // If the restart directory is not explicity set, default to the + // checkpoint directory + std::string m_restart_dir; + int m_checkpoint_epochs; + int m_checkpoint_steps; + EvalType m_checkpoint_secs; + std::string m_per_rank_dir; + int m_ckpt_dist_epochs; + int m_ckpt_dist_steps; + EvalType m_checkpoint_last; + bool m_checkpoint_dist; + bool m_checkpoint_shared; + + template + struct header_t { + execution_mode mode; + int epoch; + int step; + int shared; + char dirname[_max_dir_len]; + }; +}; + +inline std::string get_trainer_checkpoint_dirname(const std::string& trainer_name, const std::string& dir) { + return build_string(dir, '/', trainer_name, '/'); +} + +inline std::string get_last_shared_checkpoint_filename(const std::string& alg_name, const std::string& dir) { + return build_string(dir, '/', alg_name, ".last.shared.checkpoint"); +} + +inline std::string get_last_shared_checkpoint_filename(const std::string& trainer_name, const std::string& alg_name, const std::string& dir) { + return get_last_shared_checkpoint_filename(alg_name, get_trainer_checkpoint_dirname(trainer_name, dir)); +} + +inline std::string get_shared_checkpoint_dirname(const std::string& alg_name, const std::string& dir, execution_mode mode, size_t epoch, size_t step) { + return build_string(dir, '/', alg_name, ".shared.", to_string(mode), ".epoch.", epoch, ".step.", step, '/'); +} + +inline std::string get_shared_checkpoint_dirname(const std::string& trainer_name, const std::string& alg_name, const std::string& dir, execution_mode mode, size_t epoch, size_t step) { + return get_shared_checkpoint_dirname(alg_name, get_trainer_checkpoint_dirname(trainer_name, dir), mode, epoch, step); +} + +inline std::string get_last_distributed_checkpoint_filename(const std::string& alg_name, const std::string& dir) { + return build_string(dir, '/', alg_name, 
".last.distributed.checkpoint"); +} + +inline std::string get_last_distributed_checkpoint_filename(const std::string& trainer_name, const std::string& alg_name, const std::string& dir) { + return get_last_distributed_checkpoint_filename(alg_name, get_trainer_checkpoint_dirname(trainer_name, dir)); +} + +inline std::string get_distributed_checkpoint_dirname(const std::string& alg_name, const int rank_in_trainer, const std::string& dir, execution_mode mode, size_t epoch, size_t step) { + return build_string(dir, '/', + alg_name, + ".rank.", rank_in_trainer, + ".distributed.", to_string(mode), + ".epoch.", epoch, + ".step.", step, '/'); +} + +inline std::string get_distributed_checkpoint_dirname(const std::string& trainer_name, const std::string& alg_name, const int rank_in_trainer, const std::string& dir, execution_mode mode, size_t epoch, size_t step) { + return get_distributed_checkpoint_dirname(alg_name, rank_in_trainer, get_trainer_checkpoint_dirname(trainer_name, dir), mode, epoch, step); +} + +// Print last checkpoint to file, used to determine which checkpoint to load from. +inline bool write_latest(std::string filename, execution_mode mode, size_t epoch, size_t train) { + // open the file for writing + int fd = openwrite(filename.c_str()); + if (fd != -1) { + char field[256]; + sprintf(field, "mode=%s epoch=%ld step=%ld\n", to_string(mode).c_str(), epoch, train); + write_string(fd, filename.c_str(), field, strlen(field)); + // close our file + closewrite(fd, filename.c_str()); + } + return true; +} + +/** \brief Reads the "latest" file and returns the epoch number and + * sample offset for most recent checkpoint + */ +inline bool read_latest(std::string filename, execution_mode *mode, size_t *epochLast, size_t *trainLast) { + // assume we don't have a file, we'll return -1 in that case + *epochLast = -1; + *trainLast = -1; + *mode = execution_mode::invalid; + // open the file for reading + int fd = openread(filename.c_str()); + if (fd != -1) { + // read epoch from file + char field[256]; + read_string(fd, filename.c_str(), field, sizeof(field)); + char modeStr[64]; + int ret = sscanf(field, "mode=%s epoch=%ld step=%ld\n", modeStr, epochLast, trainLast); + *mode = exec_mode_from_string(modeStr); + // close our file + closeread(fd, filename.c_str()); + if(ret != 3) { return false; } + return true; + } + return false; +} + +// Builder function +std::unique_ptr +build_checkpoint_callback_from_pbuf( + const google::protobuf::Message&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED diff --git a/include/lbann/callbacks/confusion_matrix.hpp b/include/lbann/callbacks/confusion_matrix.hpp new file mode 100644 index 00000000000..187c9088487 --- /dev/null +++ b/include/lbann/callbacks/confusion_matrix.hpp @@ -0,0 +1,127 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Compute confusion matrix. + * Confusion matrices are saved in CSV files of the form + * ".csv". The (i,j)-entry is the proportion of samples + * with prediction i and label j. The prediction and label layers are + * assumed to output one-hot vectors for each mini-batch sample. + */ +class confusion_matrix : public callback_base { +public: + using AbsDistMatType = El::AbstractDistMatrix; +public: + + confusion_matrix(std::string&& prediction_layer, + std::string&& label_layer, + std::string&& prefix); + confusion_matrix(std::string const& prediction_layer, + std::string const& label_layer, + std::string const& prefix); + confusion_matrix(const confusion_matrix&); + confusion_matrix& operator=(const confusion_matrix&); + confusion_matrix* copy() const override { + return new confusion_matrix(*this); + } + std::string name() const override { return "confusion matrix"; } + + void setup(model *m) override; + + void on_epoch_begin(model *m) override { reset_counts(*m); } + void on_epoch_end(model *m) override { save_confusion_matrix(*m); } + void on_validation_begin(model *m) override { reset_counts(*m); } + void on_validation_end(model *m) override { save_confusion_matrix(*m); } + void on_test_begin(model *m) override { reset_counts(*m); } + void on_test_end(model *m) override { save_confusion_matrix(*m); } + void on_batch_end(model *m) override { update_counts(*m); } + void on_batch_evaluate_end(model *m) override { update_counts(*m); } + +private: + + /** Name of prediction layer. + * This layer is assumed to output one-hot vectors. + */ + std::string m_prediction_layer; + /** Name of label layer. + * This layer is assumed to output one-hot vectors. + */ + std::string m_label_layer; + /** Prefix for output files. */ + std::string m_prefix; + + /** Confusion matrix counts. + * Each vector should be interpreted as a num_classes x num_classes + * matrix in row-major order. The (i,j)-entry is the number of + * samples with prediction i and label j. + */ + std::map> m_counts; + + /** "View" into prediction matrix. + * This is a CPU matrix. If the prediction layer keeps data on GPU, + * then this will be a matrix copy rather than a matrix view. + */ + std::unique_ptr m_predictions_v; + /** "View" into label matrix. + * This is a CPU matrix. If the label layer keeps data on GPU or in + * a different distribution than the prediction layer, then this + * will be a matrix copy rather than a matrix view. + */ + std::unique_ptr m_labels_v; + + /** Get prediction matrix. */ + const AbsDistMatType& get_predictions(const model& m) const; + /** Get label matrix. */ + const AbsDistMatType& get_labels(const model& m) const; + + /** Reset confusion matrix counts. */ + void reset_counts(const model& m); + /** Update confusion matrix counts. 
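As described above, the counts are kept as a flat num_classes x num_classes vector in row-major order, where entry (i, j) counts samples predicted as class i with true label j, and the CSV output later normalizes these counts to proportions. A sketch of the per-mini-batch update under the one-hot assumption (names and container types are illustrative):

    #include <cstddef>
    #include <vector>

    // Illustrative only -- not LBANN code. Each sample contributes one count
    // at row = predicted class, column = true label (row-major storage).
    void update_counts(std::vector<std::size_t>& counts,
                       const std::vector<std::vector<float>>& predictions,
                       const std::vector<std::vector<float>>& labels,
                       std::size_t num_classes) {
      auto argmax = [](const std::vector<float>& v) {
        std::size_t best = 0;
        for (std::size_t k = 1; k < v.size(); ++k) {
          if (v[k] > v[best]) { best = k; }
        }
        return best;
      };
      for (std::size_t s = 0; s < predictions.size(); ++s) {
        const std::size_t i = argmax(predictions[s]);  // predicted class
        const std::size_t j = argmax(labels[s]);       // true label
        counts[i * num_classes + j] += 1;
      }
    }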
+ * Counts are updated with current mini-batch predictions and + * labels. + */ + void update_counts(const model& m); + /** Output confusion matrix to file. */ + void save_confusion_matrix(const model& m); + +}; + +// Builder function +std::unique_ptr +build_confusion_matrix_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED diff --git a/include/lbann/callbacks/debug.hpp b/include/lbann/callbacks/debug.hpp new file mode 100644 index 00000000000..354696e7dd2 --- /dev/null +++ b/include/lbann/callbacks/debug.hpp @@ -0,0 +1,112 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * @brief Phase specific "printf debugging" + * + * Print verbose status updates to standard error stream. + * This callback is useful for "printf debugging." + * + * Takes a prototext parameter @c phase: train | validate | test | \ + * if \ will print messages for all phases + * + */ +class debug : public callback_base { + public: + + /** @brief Constructor. + * + * If modes is empty, status updates will be printed for all + * execution modes. + */ + debug(std::set modes) : + m_modes(std::move(modes)) {} + debug(const debug&) = default; + debug& operator=(const debug&) = default; + debug* copy() const override { return new debug(*this); } + std::string name() const override { return "debug"; } + + /** @brief Print that a batch is beginning. */ + void on_batch_begin(model *m) override; + /** @brief Print that a batch is ending. */ + void on_batch_end(model *m) override; + /** @brief Print that a layer's forward prop is beginning. */ + void on_batch_evaluate_begin(model *m) override; + /** @brief Print that a layer's forward prop is ending. */ + void on_batch_evaluate_end(model *m) override; + + using callback_base::on_forward_prop_begin; + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_begin; + using callback_base::on_backward_prop_end; + using callback_base::on_evaluate_forward_prop_begin; + using callback_base::on_evaluate_forward_prop_end; + + /** @brief Print that a layer's forward prop is beginning. 
*/ + void on_forward_prop_begin(model *m, Layer *l) override; + /** @brief Print that a layer's forward prop is ending. */ + void on_forward_prop_end(model *m, Layer *l) override; + /** @brief Print that a layer's backward prop is beginning. */ + void on_backward_prop_begin(model *m, Layer *l) override; + /** @brief Print that a layer's backward prop is ending. */ + void on_backward_prop_end(model *m, Layer *l) override; + /** @brief Print that a layer's backward prop is beginning. */ + void on_evaluate_forward_prop_begin(model *m, Layer *l) override; + /** @brief Print that a layer's backward prop is ending. */ + void on_evaluate_forward_prop_end(model *m, Layer *l) override; + + /** @brief Print that a weights' optimization step is beginning. */ + void on_optimize_begin(model *m, weights *w) override; + /** @brief Print that a weights' optimization step is ending. */ + void on_optimize_end(model *m, weights *w) override; + + private: + + /** @brief Execution modes for which status updates will be printed. + * + * If empty, status updates are printed for all execution modes. + */ + std::set m_modes; + +}; + +// Builder function +std::unique_ptr +build_debug_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED diff --git a/include/lbann/callbacks/debug_io.hpp b/include/lbann/callbacks/debug_io.hpp new file mode 100644 index 00000000000..834f91e40bb --- /dev/null +++ b/include/lbann/callbacks/debug_io.hpp @@ -0,0 +1,97 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// debug .hpp .cpp - Callback hooks to debug LBANN +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED + +#include +#include +#include "lbann/callbacks/callback.hpp" +#include "lbann/layers/io/input/input_layer.hpp" + +namespace lbann { +namespace callback { + +/** + * Print status updates on where training is. 
+ */ +class debug_io : public callback_base { + public: + using callback_base::on_forward_prop_begin; + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_begin; + using callback_base::on_backward_prop_end; + using callback_base::on_evaluate_forward_prop_begin; + using callback_base::on_evaluate_forward_prop_end; + + /** + * Debug a particular phase; use invalid to debug every phase. + */ + debug_io(execution_mode phase = execution_mode::invalid, + int debug_lvl = 0) : + callback_base(1), + m_debug_phase(phase), + m_debug_lvl(debug_lvl) {} + debug_io(const debug_io&) = default; + debug_io& operator=( + const debug_io&) = default; + debug_io* copy() const override { return new debug_io(*this); } + /** Print that a training epoch is being started. */ + void on_epoch_begin(model *m) override; + /** Print that forward prop for a layer is beginning. */ + void on_forward_prop_begin(model *m, Layer *l) override; + + /** Print I/O details at the beginning of validation. */ + void on_validation_begin(model *m) override; + /** Print that an evaluation forward prop is beginning. */ + void on_evaluate_forward_prop_begin(model *m, Layer *l) override; + + /** Print I/O details at the beginning of testing. */ + void on_test_begin(model *m) override; + + /** Common format for printing I/O stats at the start of a mini-batch */ + void print_fp_start(model *m, generic_input_layer *input); + /** Common format for printing I/O stats at the start of a phase */ + void print_phase_start(model *m, execution_mode mode); + + std::string name() const override { return "debug_io"; } + private: + /** The phase to debug. */ + execution_mode m_debug_phase; + int m_debug_lvl; /** Debugging level: 0 - epoch begin, 1 - fwd prop */ +}; + +// Builder function +std::unique_ptr +build_debug_io_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/dump_error_signals.hpp b/include/lbann/callbacks/dump_error_signals.hpp new file mode 100644 index 00000000000..9d704d9560a --- /dev/null +++ b/include/lbann/callbacks/dump_error_signals.hpp @@ -0,0 +1,70 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
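The debug_io callback above takes the execution mode to trace (execution_mode::invalid meaning every phase) and a verbosity level, where 0 reports epoch boundaries and 1 adds per-forward-prop output. A small construction example, assuming the execution_mode enumerators used elsewhere in these headers:

    #include "lbann/callbacks/debug_io.hpp"

    // Illustrative only: trace I/O for the training phase at the most
    // verbose level described in the header.
    lbann::callback::debug_io make_io_tracer() {
      return lbann::callback::debug_io(lbann::execution_mode::training,
                                       /*debug_lvl=*/1);
    }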
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Dump gradients w.r.t. inputs to file. + * After each layer performs a backward prop step, this callback will + * dump the gradients w.r.t. inputs (the "error signals") to a + * human-readable ASCII file. This is slow and produces a lot of output. + */ +class dump_error_signals : public callback_base { + public: + + /** Constructor. + * @param basename The basename for output files. + */ + dump_error_signals(std::string basename = "") + : callback_base(), m_basename(basename) {} + dump_error_signals* copy() const override { + return new dump_error_signals(*this); + } + std::string name() const override { return "dump error signals"; } + + /** Write error signals to file after each backward prop step. */ + void on_backward_prop_end(model *m, Layer *l) override; + + private: + /** Basename for output files. */ + std::string m_basename; + +}; + +// Builder function +std::unique_ptr +build_dump_error_signals_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED diff --git a/include/lbann/callbacks/dump_gradients.hpp b/include/lbann/callbacks/dump_gradients.hpp new file mode 100644 index 00000000000..005a0195955 --- /dev/null +++ b/include/lbann/callbacks/dump_gradients.hpp @@ -0,0 +1,80 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// dump_gradients .hpp .cpp - Callbacks to dump gradients +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * @brief Dump gradient matrices to files. + * @details This will dump each hidden layer's gradient matrix after + * each minibatch. The matrices are written to files using + * Elemental's simple ASCII format. This is not meant for + * checkpointing, but for exporting gradient matrices for analysis + * that isn't easily done in LBANN. Note this dumps matrices during + * each mini-batch. 
This will be slow and produce a lot of output. + */ +class dump_gradients : public callback_base { + public: + using callback_base::on_backward_prop_end; + + /** + * @param basename The basename for writing files. + * @param batch_interval The frequency at which to dump the gradients + */ + dump_gradients(std::string basename, int batch_interval = 1) : + callback_base(batch_interval), m_basename(std::move(basename)) {} + dump_gradients( + const dump_gradients&) = default; + dump_gradients& operator=( + const dump_gradients&) = default; + dump_gradients* copy() const override { + return new dump_gradients(*this); + } + void on_backward_prop_end(model *m) override; + std::string name() const override { return "dump gradients"; } + private: + /** @brief Basename for writing files. */ + std::string m_basename; +}; + +// Builder function +std::unique_ptr +build_dump_gradients_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/dump_minibatch_sample_indices.hpp b/include/lbann/callbacks/dump_minibatch_sample_indices.hpp new file mode 100644 index 00000000000..1aca8c40a0e --- /dev/null +++ b/include/lbann/callbacks/dump_minibatch_sample_indices.hpp @@ -0,0 +1,85 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// dump_minibatch_sample_indices .hpp .cpp - Callbacks +// to dump the list of indices per minibatch +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * @brief Dump sample indices for each minibatch to files. + * @details This will dump the list of indices from the training / + * validation / testing data that was processed Note this dumps + * vectors during each mini-batch. This will be slow and produce a lot + * of output. + */ +class dump_minibatch_sample_indices : public callback_base { + public: + using callback_base::on_forward_prop_end; + using callback_base::on_evaluate_forward_prop_end; + + /** + * @param basename The basename for writing files. 
+ * @param batch_interval The frequency at which to dump sample indices + */ + dump_minibatch_sample_indices(std::string basename, + int batch_interval = 1) : + callback_base(batch_interval), m_basename(std::move(basename)) {} + dump_minibatch_sample_indices( + const dump_minibatch_sample_indices&) = default; + dump_minibatch_sample_indices& operator=( + const dump_minibatch_sample_indices&) = default; + dump_minibatch_sample_indices* copy() const override { + return new dump_minibatch_sample_indices(*this); + } + void on_forward_prop_end(model *m, Layer *l) override; + void on_evaluate_forward_prop_end(model *m, Layer *l) override; + + void dump_to_file(model *m, Layer *l, int64_t step); + + std::string name() const override { return "dump minibatch sample indices"; } + private: + /** Basename for writing files. */ + std::string m_basename; +}; + +// Builder function +std::unique_ptr +build_dump_mb_indices_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED diff --git a/include/lbann/callbacks/dump_outputs.hpp b/include/lbann/callbacks/dump_outputs.hpp new file mode 100644 index 00000000000..34610896f3b --- /dev/null +++ b/include/lbann/callbacks/dump_outputs.hpp @@ -0,0 +1,127 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +#include +#include + +namespace lbann { +namespace callback { + +/** @brief Dump layer output tensors to files. + * + * Saves a file for each output tensor of each selected layer, + * computed at each mini-batch step. Output files have the form + * "--epoch<#>-step<#>--output<#>.". + * This is primarily intended as a debugging tool, although it can be + * used for inference when performance is not critical. + * + * For NumPy file formats (npy and npz), tensor dimensions are + * recorded. For text file formats (CSV and TSV), each line contains + * flattened tensor data corresponding to one mini-batch sample + * (which is the transpose of the column-major matrix representation + * we use internally). + * + * CNPY is required to export to NumPy file formats (npy and npz). 
+ */ +class dump_outputs : public callback_base { +public: + + /** @brief Construct a callback to dump outputs. + * + * @param layer_names Names of layers with output dumps + * (default: dump outputs for all layers). + * @param modes Execution modes with output dumps + * (default: dump outputs for all modes). + * @param batch_interval Frequency of output dumps (default: dump + * outputs at each mini-batch step). + * @param directory Directory for output files (default: current + * working directory). + * @param file_format Output file format. Options are csv, tsv, + * npy, npz (default: csv). + */ + dump_outputs( + std::set layer_names,// = std::set(), + std::set modes, // = std::set(), + El::Int batch_interval = 0, + std::string directory = "", + std::string file_format = ""); + + dump_outputs* copy() const override { + return new dump_outputs(*this); + } + std::string name() const override { return "dump outputs"; } + + void on_forward_prop_end(model* m, Layer* l) override { + do_dump_outputs(*m, *l); + } + void on_evaluate_forward_prop_end(model* m, Layer* l) override { + const auto& c = static_cast(m->get_execution_context()); + if(c.get_step() % m_batch_interval == 0) { + do_dump_outputs(*m, *l); + } + } + +private: + + /** @brief Names of layers with output dumps. + * @details If empty, outputs will be dumped for all layers. + */ + std::set m_layer_names; + + /** @brief Execution modes with output dumps. + * @details If empty, outputs will be dumped for all execution modes. + */ + std::set m_modes; + + /** @brief Directory for output files. + * @details Pathname has trailing '/'. + */ + std::string m_directory; + + /** @brief Output file format. */ + std::string m_file_format; + + /** @brief Dump outputs to file. + * @details Returns immediately if an output dump is not needed. + */ + void do_dump_outputs(const model& m, const Layer& l); + +}; + +// Builder function +std::unique_ptr +build_dump_outputs_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/dump_weights.hpp b/include/lbann/callbacks/dump_weights.hpp new file mode 100644 index 00000000000..ecfaa58d9d1 --- /dev/null +++ b/include/lbann/callbacks/dump_weights.hpp @@ -0,0 +1,82 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
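Putting the dump_outputs parameters together: a sketch that limits dumping to one layer during validation, every 10 steps, in CSV form. The layer and directory names are placeholders; the constructor is the one declared above, where the two set arguments carry no defaults and must both be passed:

    #include "lbann/callbacks/dump_outputs.hpp"

    // Illustrative only: dump the "softmax" layer's outputs as CSV during
    // validation, every 10 mini-batch steps, under "outputs/".
    lbann::callback::dump_outputs* make_output_dumper() {
      return new lbann::callback::dump_outputs(
          /*layer_names=*/{"softmax"},
          /*modes=*/{lbann::execution_mode::validation},
          /*batch_interval=*/10,
          /*directory=*/"outputs",
          /*file_format=*/"csv");
    }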
+// +// dump_weights .hpp .cpp - Callbacks to dump weight matrices +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Dump weight matrices to files. + * This will dump each hidden layer's weight/bias matrix after specified epoch interval. + * The matrices are written to files using Elemental's simple ASCII format. This + * is not meant for checkpointing, but for exporting weight matrices for + * analysis that isn't easily done in LBANN. + */ +class dump_weights : public callback_base { + public: + /** + * @param basename The basename for writing files. + */ + dump_weights(std::string dir, El::Int epoch_interval=1) : + callback_base(), m_directory(std::move(dir)), + m_epoch_interval(std::max(El::Int(1),epoch_interval)) {} + dump_weights(const dump_weights&) = default; + dump_weights& operator=( + const dump_weights&) = default; + dump_weights* copy() const override { + return new dump_weights(*this); + } + void on_train_begin(model *m) override; + void on_epoch_end(model *m) override; + std::string name() const override { return "dump weights"; } + void set_target_dir(const std::string& dir) { m_directory = dir; } + const std::string& get_target_dir() { return m_directory; } + private: + /** Basename for writing files. */ + std::string m_directory; + /** Interval at which to dump weights */ + El::Int m_epoch_interval; + /// Dump weights from learning layers. + void do_dump_weights(const model& m, std::string s = ""); +}; + +// Builder function +std::unique_ptr +build_dump_weights_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/early_stopping.hpp b/include/lbann/callbacks/early_stopping.hpp new file mode 100644 index 00000000000..f74611900f5 --- /dev/null +++ b/include/lbann/callbacks/early_stopping.hpp @@ -0,0 +1,74 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
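A short usage sketch for the dump_weights callback above (not part of the patch); the directory name is a placeholder. Note that the constructor clamps epoch intervals below 1 up to 1.

#include "lbann/callbacks/dump_weights.hpp"

#include <memory>

// Write the weight matrices to "weights_out" after every fifth epoch.
auto dump_weights_cb =
  std::make_unique<lbann::callback::dump_weights>("weights_out", /*epoch_interval=*/5);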
+// +// lbann_early_stopping .hpp .cpp - Callback hooks for early stopping +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED +#define LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED + +#include +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Stop training after validation error stops improving. + */ +class early_stopping : public callback_base { + public: + /** + * Continue training until score has not improved for patience epochs. + */ + early_stopping(int64_t patience); + early_stopping(const early_stopping&) = default; + early_stopping& operator=( + const early_stopping&) = default; + early_stopping* copy() const override { + return new early_stopping(*this); + } + /** Update validation score and check for early stopping. */ + void on_validation_end(model *m) override; + std::string name() const override { return "early stopping"; } + private: + /** Number of epochs to wait for improvements. */ + int64_t m_patience; + /** Last recorded score. */ + EvalType m_last_score = std::numeric_limits::max(); + /** Current number of epochs without improvement. */ + int64_t m_wait = 0; +}; + +// Builder function +std::unique_ptr +build_early_stopping_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED diff --git a/include/lbann/callbacks/gpu_memory_usage.hpp b/include/lbann/callbacks/gpu_memory_usage.hpp new file mode 100644 index 00000000000..1d18019776e --- /dev/null +++ b/include/lbann/callbacks/gpu_memory_usage.hpp @@ -0,0 +1,58 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// callback_gpu_memory_usage .hpp .cpp - Callbacks for printing GPU memory usage +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Callback hooks for printing GPU memory usage. */ +class gpu_memory_usage : public callback_base { + public: + + /** Constructor. 
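A standalone sketch of the patience rule implied by the early_stopping members above, assuming lower scores are better (as the initialization of m_last_score to the maximum value suggests); the authoritative logic lives in the callback's .cpp file.

#include <cstdint>

// Returns true when the score has failed to improve for `patience` consecutive checks.
bool should_stop(double score, double& last_score, int64_t& wait, int64_t patience) {
  if (score < last_score) { // improvement: remember it and reset the counter
    last_score = score;
    wait = 0;
    return false;
  }
  return ++wait >= patience; // no improvement for `patience` epochs in a row
}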
+ */ + gpu_memory_usage() = default; + gpu_memory_usage(const gpu_memory_usage&) = default; + gpu_memory_usage& operator=(const gpu_memory_usage&) = default; + gpu_memory_usage* copy() const override { return new gpu_memory_usage(*this); } + void on_epoch_begin(model *m) override; + std::string name() const override { return "GPU memory usage"; } +}; + +// Builder function +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( + gpu_memory_usage, build_gpu_memory_usage_callback_from_pbuf); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED diff --git a/include/lbann/callbacks/hang.hpp b/include/lbann/callbacks/hang.hpp new file mode 100644 index 00000000000..246d72ca51b --- /dev/null +++ b/include/lbann/callbacks/hang.hpp @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// hang .hpp .cpp - Callback to hang LBANN for debuggers +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Hang LBANN as training starts so debuggers can attach. + * This will cause either a specific rank (in COMM_WORLD) or every rank to hang. + * Attach to the hung ranks and set the hang flag to false with a debugger to + * proceed. + */ +class hang : public callback_base { + public: + /** + * @param rank_to_hang The rank to hang; -1 for every rank (default). + */ + hang(int rank_to_hang = -1) : + m_rank_to_hang(rank_to_hang) {} + hang(const hang&) = default; + hang& operator=(const hang&) = default; + hang* copy() const override { return new hang(*this); } + + void setup(model* m) override; + + /// Hang on train begin. + void on_train_begin(model* m) override { + if (m_rank_to_hang == -1 || + m_rank_to_hang == m->get_comm()->get_rank_in_world()) { + // Set this flag to false with your debugger to resume execution. + volatile bool lbann_hang = true; + while (lbann_hang) {} + } + } + std::string name() const override { return "hang"; } + private: + /// The rank that will hang; -1 for every rank. 
+ int m_rank_to_hang; +}; + +// Builder function +std::unique_ptr +build_hang_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED diff --git a/include/lbann/callbacks/imcomm.hpp b/include/lbann/callbacks/imcomm.hpp new file mode 100644 index 00000000000..f7703ade0d9 --- /dev/null +++ b/include/lbann/callbacks/imcomm.hpp @@ -0,0 +1,122 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// imcomm .hpp .cpp - Send gradient updates between models +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED + +#include +#include +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { + +template +class data_type_weights; + +namespace callback { + +/** + * @brief Support inter-model communication after each mini-batch to + * synchronize gradient updates. + */ +class imcomm : public callback_base { + public: + using callback_base::on_backward_prop_end; + + enum comm_type { + NONE=0, /** Do no gradient updates. */ + NORMAL, /** Simply sum gradient updates. */ + }; + + /** + * @brief Initialize with ct being used for all weights. + */ + imcomm(comm_type ct = NORMAL, + const std::shared_ptr& summarizer = nullptr); + imcomm(const imcomm&) = default; + imcomm& operator=(const imcomm&) = default; + imcomm* copy() const override { + return new imcomm(*this); + } + /** + * @brief Convenience initialization to do one update type for specific weights. + * + * @details Implies no inter-model updates for other weights. + */ + imcomm(comm_type ct, std::unordered_set weights_list, + const std::shared_ptr& summarizer = nullptr); + + /** @brief Choose comm type ct for weights. */ + void set_weights_comm(weights *w, comm_type ct); + + /** @brief Do initialization for this model. */ + void setup(model *m) override; + + /** @brief Make sure all models have the same weights. */ + void on_train_begin(model *m) override; + + /** @brief Do inter-model gradient updates. */ + void on_backward_prop_end(model *m) override; + + std::string name() const override { return "imcomm"; } + + private: + /** @brief Summarize relevant statistics. */ + template + void do_summary(model const& m, data_type_weights& w, EvalType im_time); + + private: + /** @brief Parameters for a given set of weights. 
*/ + struct imcomm_params { + /** @brief Type of communication done. */ + comm_type ct = NONE; + }; + + /** @brief Default communication type. */ + comm_type m_default_ct; + + /** @brief Per-weights parameters. */ + std::unordered_map m_weights_params; + + /** @brief @brief lbann_summary */ + std::shared_ptr m_summarizer = nullptr; +}; + +/** @brief returns a string representation of the weight_initialization */ +std::string get_comm_type_name(typename imcomm::comm_type m); + +// Builder function +std::unique_ptr +build_imcomm_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED diff --git a/include/lbann/callbacks/learning_rate.hpp b/include/lbann/callbacks/learning_rate.hpp new file mode 100644 index 00000000000..8973fe34b4e --- /dev/null +++ b/include/lbann/callbacks/learning_rate.hpp @@ -0,0 +1,344 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// lbann_learning_rate .hpp .cpp - Callback hooks for learning rate schedules +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED +#define LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED + +#include +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +// Different schedules should inherit from learning_rate. + +/** + * Base class for learning rate schedules. + * Child classes should implement the schedule method to make changes. + */ +class learning_rate : public callback_base { + public: + learning_rate(); + learning_rate(const learning_rate&) = default; + learning_rate& operator=( + const learning_rate&) = default; + /** Only apply to specific weights. */ + learning_rate(std::vector weights_names); + /** Do some initialization. */ + void setup(model *m) override; + /** Apply global learning rate schedules. */ + void on_epoch_end(model *m) override; + + using callback_base::on_backward_prop_end; + /** Apply local/per-optimizer learning rate schedules. */ + void on_backward_prop_end(model *m) override; + protected: + /** + * This is called at the end of every epoch to update the learning + * rate for every optimizer. Adjustments should be made based on the + * current global learning rate. + * The returned learning rate will be used to automatically update + * the current global learning rate. 
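A minimal construction sketch for the imcomm callback above (not part of the patch); the optional summarizer argument is left at its default.

#include "lbann/callbacks/imcomm.hpp"

#include <memory>

// Sum gradient updates across all models after each mini-batch.
auto imcomm_cb =
  std::make_unique<lbann::callback::imcomm>(lbann::callback::imcomm::NORMAL);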
+ */ + virtual float global_schedule(model *m) { + return get_current_global_learning_rate(); + } + + /** + * This is called at the end of every training mini-batch to update the + * learning rate for optimizer opt. The current global learning rate is *not* + * updated automatically based on this method. + */ + virtual float optimizer_schedule(model *m, optimizer &opt); + + const std::unordered_set& get_weights() const noexcept { + return m_weights; + } + + static float get_current_global_learning_rate() noexcept { + return m_cur_global_lr; + } + + static void update_global_learning_rate(float rate) noexcept { + m_cur_global_lr = rate; + } + + private: + /** + * This should be maintained by all learning rate schedule + * implementations as the current global learning rate. This enables + * coordination among different schedules, particularly ones that + * work on a per-optimizer basis. + */ + static float m_cur_global_lr; + + /** Names of the weights being updated. */ + std::vector m_weights_names; + + /** Weights to update. */ + std::unordered_set m_weights; +}; + +/** + * Decrease the learning rate by a fixed proportion every X epochs. + */ +class step_learning_rate : public learning_rate { + public: + /** Decrease the learning rate by amt every step epochs. */ + step_learning_rate(size_t step, float amt); + step_learning_rate(size_t step, float amt, + std::vector weights_names); + step_learning_rate( + const step_learning_rate&) = default; + step_learning_rate& operator=( + const step_learning_rate&) = default; + step_learning_rate* copy() const override { + return new step_learning_rate(*this); + } + std::string name() const override { return "step learning rate"; } + protected: + float global_schedule(model *m) override; + private: + /** Number of epochs between each learning rate decrease. */ + size_t m_step; + /** Amount to decrease the learning rate by. */ + float m_amt; +}; + +// Builder function +std::unique_ptr +build_step_learning_rate_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +/** + * Decrease the learning rate by a fixed proportion when validation error stops + * improving. + */ +class adaptive_learning_rate : public learning_rate { + public: + /** + * Decrease the learning rate by amt if accuracy does not improve for patience + * epochs. + */ + adaptive_learning_rate(size_t patience, float amt); + adaptive_learning_rate(size_t patience, float amt, + std::vector weights_names); + adaptive_learning_rate( + const adaptive_learning_rate&) = default; + adaptive_learning_rate& operator=( + const adaptive_learning_rate&) = default; + adaptive_learning_rate* copy() const override { + return new adaptive_learning_rate(*this); + } + std::string name() const override { return "adaptive learning rate"; } + protected: + float global_schedule(model *m) override; + private: + /** Number of epochs to wait for improvements. */ + size_t m_patience; + /** Amount to decrease the learning rate by. */ + float m_amt; + /** Current epoch. */ + size_t m_cur_epoch = std::numeric_limits::max(); + /** Last recorded score. */ + EvalType m_last_score = std::numeric_limits::max(); + /** Current number of epochs without improvement. */ + size_t m_wait = 0; + /** Whether to adjust learning rate for current epoch. 
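One plausible reading of the step schedule above ("decrease the learning rate by a fixed proportion every step epochs"), written as a standalone helper; the authoritative schedule is implemented in the corresponding .cpp file.

#include <cstddef>

// Every `step` epochs, scale the current rate by the fixed proportion `amt`.
float step_schedule(std::size_t epoch, std::size_t step, float amt, float lr) {
  return (epoch != 0 && epoch % step == 0) ? lr * amt : lr;
}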
*/ + bool m_adjust_learning_rate = false; +}; + +// Builder function +std::unique_ptr +build_adaptive_learning_rate_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +/** + * Decrease learning rate by a fixed amount at fixed times. + */ +class drop_fixed_learning_rate : public learning_rate { + public: + /** + * Decrease the learning rate by amt when each epoch in drop_epochs is + * reached. + */ + drop_fixed_learning_rate( + std::vector drop_epochs, float amt); + drop_fixed_learning_rate( + std::vector drop_epochs, float amt, + std::vector weights_names); + drop_fixed_learning_rate( + const drop_fixed_learning_rate&) = default; + drop_fixed_learning_rate& operator=( + const drop_fixed_learning_rate&) = default; + drop_fixed_learning_rate* copy() const override { + return new drop_fixed_learning_rate(*this); + } + std::string name() const override { return "drop fixed learning rate"; } + protected: + float global_schedule(model *m) override; + private: + /// Amount to decrease the learning rate by. + float m_amt; + /** + * Epochs to drop learning rate at. This is stored in reverse sorted order, + * so that the end can be examined and then popped in constant time. + */ + std::vector m_drop_epochs; +}; + +// Builder function +std::unique_ptr +build_drop_fixed_learning_rate_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +/** + * Linearly increase the learning rate to reach a target value over a + * fixed number of epochs. + * @note This currently assumes every optimizer begins with the same + * learning rate. This also *forces* its schedule and will stomp over + * other changes. + */ +class linear_growth_learning_rate : public learning_rate { + public: + /** + * Linearly increase the learning rate to reach target after num_epochs. + */ + linear_growth_learning_rate( + float target, size_t num_epochs); + linear_growth_learning_rate( + float target, size_t num_epochs, size_t delay); + linear_growth_learning_rate( + float target, size_t num_epochs, size_t delay, + std::vector weights_names); + linear_growth_learning_rate( + const linear_growth_learning_rate&) = default; + linear_growth_learning_rate& operator=( + const linear_growth_learning_rate&) = default; + linear_growth_learning_rate* copy() const override { + return new linear_growth_learning_rate(*this); } + void setup(model *m) override; + std::string name() const override { return "linear growth learning rate"; } + protected: + float global_schedule(model *m) override; + private: + /// Initial learning rate. + float m_base_lr; + /// Target learning rate to reach. + float m_target; + /// Amount to increase each epoch. + float m_inc; + /// Number of epochs over which to scale the learning rate. + size_t m_num_epochs; + /// Number of epochs to delay before starting growth. + size_t m_delay; +}; + +// Builder function +std::unique_ptr +build_linear_growth_learning_rate_callback_from_pbuf( + const google::protobuf::Message&,std::shared_ptr const&); + +/** + * Decrease the learning rate by polynomial policy + * base_lr*(1 - i_cur/i_max)^p, where + * base_lr is the initial learning rate, i_cur is the current iteration, + * i_max is the maximum iteration, and p is a parameter. 
+ */ +class poly_learning_rate : public learning_rate { + public: + poly_learning_rate(double p, size_t n_epochs, size_t max_iter); + poly_learning_rate(double p, size_t n_epochs, size_t max_iter, double endl_r, + std::vector weights_names); + poly_learning_rate( + const poly_learning_rate&) = default; + poly_learning_rate& operator=( + const poly_learning_rate&) = default; + poly_learning_rate* copy() const override { + return new poly_learning_rate(*this); + } + void setup(model *m) override; + std::string name() const override { return "poly learning rate"; } + protected: + float global_schedule(model *m) override; + float optimizer_schedule(model *m, optimizer &opt) override; + private: + /// The exponent to compute new learning rate in poly policy + double m_p; + /// The number of epochs for training + size_t m_num_epochs; + /// The maximum number of iterations until which the learning rate changes + size_t m_max_iter; + /// The minimum learning rate + float m_end_lr; + /// The current rate to scale the base learning rate + float m_lr; + /// The learning rate scale used at the end of the last epoch + float m_last_epoch_lr; +}; + +// Builder function +std::unique_ptr +build_poly_learning_rate_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +/** + * This implements an adaptive scheme for adjust each optimizer's + * learning rate based on the ratio of the norms of its weights and + * its gradients. + * See: You et al. "Scaling SGD Batch Size to 32K for ImageNet + * Training", 2017. + */ +class optimizerwise_adaptive_learning_rate : public learning_rate { + public: + optimizerwise_adaptive_learning_rate(float scale); + optimizerwise_adaptive_learning_rate( + float scale, std::vector weights_names); + optimizerwise_adaptive_learning_rate( + const optimizerwise_adaptive_learning_rate&) = default; + optimizerwise_adaptive_learning_rate& operator=( + const optimizerwise_adaptive_learning_rate&) = default; + optimizerwise_adaptive_learning_rate* copy() const override { + return new optimizerwise_adaptive_learning_rate(*this); } + std::string name() const override { return "optimizerwise adaptive learning rate"; } + protected: + float optimizer_schedule(model *m, optimizer &opt) override; + private: + float m_scale; +}; + +// Builder function +std::unique_ptr +build_optimizerwise_adaptive_learning_rate_callback_from_pbuf( + const google::protobuf::Message&,std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED diff --git a/include/lbann/callbacks/load_model.hpp b/include/lbann/callbacks/load_model.hpp new file mode 100644 index 00000000000..670b775bdbb --- /dev/null +++ b/include/lbann/callbacks/load_model.hpp @@ -0,0 +1,108 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
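The polynomial policy documented above, written out as a standalone helper; the end_lr floor is an assumption based on the m_end_lr member.

#include <algorithm>
#include <cmath>
#include <cstddef>

// base_lr * (1 - i_cur / i_max)^p, never dropping below end_lr.
double poly_schedule(double base_lr, double p,
                     std::size_t i_cur, std::size_t i_max,
                     double end_lr = 0.0) {
  const double frac = 1.0 - static_cast<double>(i_cur) / static_cast<double>(i_max);
  return std::max(base_lr * std::pow(frac, p), end_lr);
}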
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// load_model .hpp .cpp - Callbacks to load pretrained model(s) +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_LOAD_MODEL_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_LOAD_MODEL_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +#include + +// Forward-declare protobuf classes +namespace lbann_data { +class Model; +} + +namespace lbann { +namespace callback { + +/** + * Load pretrained model from file + */ +class load_model : public callback_base { + public: + /** + * @param dir directory to load model + * @param extension file extension e.g., model, state ...... + */ + load_model(std::vector dirs, + std::string extension="prototext") : + callback_base(), m_dirs(std::move(dirs)), + m_extension(std::move(extension)), + m_loaded(false) + {} + load_model(const load_model&) = default; + load_model& operator=( + const load_model&) = default; + load_model* copy() const override { + return new load_model(*this); + } + + inline void add_dir(const std::string& dir){ + m_dirs.emplace_back(dir); + } + + void on_train_begin(model *m) override; + + void on_test_begin(model *m) override; + + /* ckptdir_is_fullpath flag if true + * allow user to specify full path to model weights to load + * and allow system to ignore appending trainer id, num of epochs/steps + * to default ckpt_dir*/ + static bool load_model_weights(const std::string& ckpt_dir, + const std::string& alg_name, + model *m, + bool ckptdir_is_fullpath=false); + + std::string name() const override { return "load model"; } + + protected: + friend class lbann::model; + + + private: + std::vector m_dirs; //director(ies) to load pretrained model(s) + /// Disables the normal behavior of saving when training is complete + std::string m_extension; //file extension + + /// Flag to indicate if the model has already been loaded + bool m_loaded; +}; + +// Builder function +std::unique_ptr +build_load_model_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_LOAD_MODEL_HPP_INCLUDED diff --git a/include/lbann/callbacks/ltfb.hpp b/include/lbann/callbacks/ltfb.hpp new file mode 100644 index 00000000000..36ca778cdfe --- /dev/null +++ b/include/lbann/callbacks/ltfb.hpp @@ -0,0 +1,185 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
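An illustrative construction sketch for the load_model callback above (not part of the patch); the directory names are placeholders and the element type of the stripped std::vector parameter is assumed to be std::string.

#include "lbann/callbacks/load_model.hpp"

#include <memory>
#include <string>
#include <vector>

// Load pretrained weights from a (placeholder) directory of prototext files.
auto load_model_cb = std::make_unique<lbann::callback::load_model>(
  std::vector<std::string>{"trained_models/autoencoder"}, "prototext");
// Additional directories can be appended before training begins, e.g.:
// load_model_cb->add_dir("trained_models/classifier");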
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include +#include +#include + +namespace lbann { +namespace callback { + +/** @brief Tournament training. + * + * This is intended to support research into the LTFB algorithm. An + * outline: + * - Divide the computational resources into multiple "trainers" + * that can operate in parallel. + * - Setup a model on each trainer and begin training independently. + * - Periodically launch tournaments to select "good" models. More + * specifically, trainers partner up and exchange their models. + * Each trainer evaluates a metric for its local and partner + * models, using its validation data set. The model with the better + * score is retained and the other one is discarded. + * + * There are many algorithmic variations to be explored: + * - How is data is divvied up amongst the trainers. Is it strictly + * partitioned, partially shared, or completely replicated? + * - What model components are exchanged? Just the trainable weights, + * or a subset of the weights? Hyperparameters? + * - Can this be used to explore model architectures? + * + * @todo Exchange optimizer state. + * @todo Support heterogeneous models. + */ +class ltfb : public callback_base { +public: + + /** Inter-trainer communication scheme for LTFB. + * + * The specifics of these algorithms are experimental and will be + * in flux. + */ + enum class communication_algorithm { + /** Directly exchange weights values with sendrecv. + * + * Corresponding ranks in partner trainers will iterate through + * their weights and exchange values with sendrecvs. + * + * Notes: + * - Requires all models to be identical aside from their + * weights values, so this is not suitable for hyperparameter + * or model architecture exploration. + * - Optimizer state is not exchanged, so there may be wonky + * learning behavior immediately after a tournament. + * - Optimal if communication performance between ranks is + * uniform and independent. If intra-trainer communication is + * fast or if communication performance is sensitive to + * network traffic, it may be advantageous to gather model + * data on the trainer master ranks and only perform + * inter-trainer communication between them. + */ + sendrecv_weights, + + /** Save and load model data with checkpoint files. + * + * @todo Implement. + * + * Notes: + * - Supports hyperparameter exploration. + * - Checkpoint files currently do not store model architecture + * information, so this is not suitable for model + * architecture exploraiton. + * - This approach is temporary and experimental, since going + * through the file system is very suboptimal. When a wire + * format for model checkpoints is developed, it should be + * used instead. + */ + checkpoint_file + }; + + /** @brief Construct the LTFB callback + * @param batch_interval Number of training mini-batch steps between + * tournaments. + * @param metric_name Metric for tournament evaluation. 
+ * @param weights_names List of weights to exchange with partner. + * If empty, then all weights are exchanged. + * @param low_score_wins Whether low-scoring or high-scoring models + * survive a tournament. + * @param comm_algo Inter-trainer communication scheme. + * @param summarizer The summarizer to use for this callback + */ + ltfb( + El::Int batch_interval, + std::string metric_name, + std::set weights_names = std::set(), + bool low_score_wins = false, + communication_algorithm comm_algo = communication_algorithm::sendrecv_weights, + const std::string& ckptdir = "", + bool exchange_hyperparameters = false); + ltfb(const ltfb& other); + ltfb& operator=(const ltfb& other); + ltfb* copy() const override { return new ltfb(*this); } + std::string name() const override { return "LTFB"; } + + void setup(model *m) override; + void on_train_begin(model *m) override; + void on_batch_begin(model *m) override; + + /** Convert string to LTFB communication algorithm. + * + * If an empty string is provided, returns @c + * communication_algorithm::sendrecv_weights. + */ + static communication_algorithm string_to_comm_algo(const std::string& str); + + void set_ckpt_basedir(const std::string& dir); + std::string get_ckpt_basedir() const; + +private: + + /** Metric for tournament evaluation. */ + std::string m_metric_name; + + /** List of weights to exchange with partner. + * + * If empty, then all weights are exchanged. + */ + std::set m_weights_names; + + /** Whether low-scoring or high-scoring models survive a + * tournament. */ + bool m_low_score_wins; + + /** Inter-trainer communication scheme. */ + communication_algorithm m_comm_algo; + + /** Base directory of the checkpoint state */ + std::string m_ckpt_basedir; + + /** Whether to exchange training hyperparameters between trainers + */ + bool m_exchange_hyperparameters; + + /** Workspace weights. + * + * Used to temporarily store local weights during a tournament. + */ + std::vector> m_workspace_weights; +}; + +// Builder function +std::unique_ptr +build_ltfb_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED diff --git a/include/lbann/callbacks/mixup.hpp b/include/lbann/callbacks/mixup.hpp new file mode 100644 index 00000000000..b4b5873f3a6 --- /dev/null +++ b/include/lbann/callbacks/mixup.hpp @@ -0,0 +1,88 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
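A construction sketch for the LTFB callback above (not part of the patch); the metric name is a placeholder and the element type of the stripped std::set parameter is assumed to be std::string.

#include "lbann/callbacks/ltfb.hpp"

#include <memory>
#include <set>
#include <string>

// Hold a tournament every 100 training steps, judged on a (placeholder) metric,
// exchanging all weights directly with sendrecv.
auto ltfb_cb = std::make_unique<lbann::callback::ltfb>(
  /*batch_interval=*/100,
  /*metric_name=*/"categorical accuracy",
  /*weights_names=*/std::set<std::string>{},  // empty: exchange all weights
  /*low_score_wins=*/false,
  lbann::callback::ltfb::string_to_comm_algo("sendrecv_weights"));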
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_MIXUP_HPP +#define LBANN_CALLBACKS_MIXUP_HPP + +#include +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Apply mixup to named input layers. + * + * See: + * + * Zhang, H. et al. "mixup: Beyond Empirical Risk Minimization." ICLR, 2018. + * + * This implementation does mixup within a single batch, per the recommendation + * within the paper. + * + * This approach may create duplicate images, and so uses + * + * lambda = max(lambda, 1 - lambda) + * + * for the mixing value. + * + * This recommendation comes from https://docs.fast.ai/callbacks.mixup.html + * + * The recommended default alpha (from the paper) is 0.4. + */ +class mixup : public callback_base { +public: + /** Apply mixup to layers named in layers with mixup parameter alpha. */ + mixup(std::unordered_set layers, float alpha) : + callback_base(), m_layers(layers), m_alpha(alpha) { + if (alpha < 0.0f) { + LBANN_ERROR("Mixup alpha must be non-negative."); + } + } + + mixup* copy() const override { return new mixup(*this); } + std::string name() const override { return "mixup"; } + + void on_forward_prop_end(model *m, Layer *l) override; + +private: + /** Names of input layers to apply mixup to. */ + std::unordered_set m_layers; + /** mixup parameter. */ + float m_alpha; +}; + +// Builder function +std::unique_ptr +build_mixup_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_MIXUP_HPP diff --git a/include/lbann/callbacks/monitor_io.hpp b/include/lbann/callbacks/monitor_io.hpp new file mode 100644 index 00000000000..8f665c928d4 --- /dev/null +++ b/include/lbann/callbacks/monitor_io.hpp @@ -0,0 +1,74 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// monitor_io .hpp .cpp - Callback hooks for I/O monitoring +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_IO_HPP_INCLUDED +#define LBANN_CALLBACKS_IO_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +#include + +#include +#include + +namespace lbann { +namespace callback { + +/** + * Print information on the amount of IO that layers do. + */ +class monitor_io : public callback_base { + public: + monitor_io() = default; + /** Only apply to specific layers. 
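A standalone sketch of the mixing rule the mixup documentation describes: draw lambda from Beta(alpha, alpha), clamp it with max(lambda, 1 - lambda), then blend a pair of samples. Sampling Beta via two gamma draws is just one standard way to do it; the callback itself operates in place on the input layer's mini-batch and mixes labels with the same lambda.

#include <algorithm>
#include <cstddef>
#include <random>
#include <vector>

std::vector<float> mixup_pair(const std::vector<float>& x_i,
                              const std::vector<float>& x_j,
                              float alpha, std::mt19937& gen) {
  std::gamma_distribution<float> g(alpha, 1.0f); // Beta(alpha, alpha) via two gamma draws
  const float a = g(gen), b = g(gen);
  float lambda = a / (a + b);
  lambda = std::max(lambda, 1.0f - lambda);      // avoid creating near-duplicate samples
  std::vector<float> mixed(x_i.size());
  for (std::size_t k = 0; k < mixed.size(); ++k) {
    mixed[k] = lambda * x_i[k] + (1.0f - lambda) * x_j[k];
  }
  return mixed;
}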
*/ + monitor_io(std::vector const& layers) + : m_layers(layers.begin(), layers.end()) {} + + monitor_io(const monitor_io&) = default; + monitor_io& operator=(const monitor_io&) = default; + monitor_io* copy() const override { + return new monitor_io(*this); + } + /** Report how much I/O has occured per data reader */ + void on_epoch_end(model *m) override; + void on_test_end(model *m) override; + std::string name() const override { return "monitor_io"; } + private: + /** Indicies of layers to monitor. */ + std::unordered_set m_layers; +}; + +// Builder function +std::unique_ptr +build_monitor_io_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/perturb_adam.hpp b/include/lbann/callbacks/perturb_adam.hpp new file mode 100644 index 00000000000..3101018c6a7 --- /dev/null +++ b/include/lbann/callbacks/perturb_adam.hpp @@ -0,0 +1,135 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include "lbann/optimizers/adam.hpp" + +#include + +namespace lbann { +namespace callback { + +/** @brief Hyperparameter exploration with Adam optimizers. + * + * Goes through the Adam optimizers in a model and perturbs four + * hyperparameters: the learning rate, @f$\beta_1@f$, @f$\beta_2@f$, + * and @f$\epsilon@f$. Since these hyperparameters can range over + * orders of magnitude, the perturbations are performed in log space. + * More precisely, random values are drawn from normal distributions + * (with user-provided standard deviations) and added to + * @f$\log(\text{learning rate})@f$, @f$\log(1-\beta_1)@f$, + * @f$\log(1-\beta_2)@f$, and @f$\log\epsilon@f$. + */ +class perturb_adam : public callback_base { +public: + + /** @param learning_rate_factor Standard deviation of learning rate + * perturbation (in log space). + * @param beta1_factor Standard deviation of @f$\beta_1@f$ + * perturbation (in log space). + * @param beta2_factor Standard deviation of @f$\beta_2@f$ + * perturbation (in log space). + * @param eps_factor Standard deviation of @f$\epsilon@f$ + * perturbation (in log space). 
+ * @param perturb_during_training Whether to periodically perturb + * hyperparameters during training + * or to only perturb once during + * setup. + * @param batch_interval Number of training mini-batch steps between + * perturbations. Only used if + * @c perturb_during_training is @c true. + * @param weights_names Names of weights with Adam optimizers. If + * empty, all Adam optimizers in the model are + * perturbed. + */ + perturb_adam(DataType learning_rate_factor, + DataType beta1_factor, + DataType beta2_factor, + DataType eps_factor = 0, + bool perturb_during_training = false, + El::Int batch_interval = 1, + std::set weights_names + = std::set()); + perturb_adam* copy() const override { return new perturb_adam(*this); } + std::string name() const override { return "perturb Adam"; } + + void setup(model* m) override; + void on_batch_begin(model* m) override; + +private: + + /** Standard deviation of learning rate perturbation. + * + * In log space. + */ + DataType m_learning_rate_factor; + /** Standard deviation of @f$\beta_1@f$ perturbation. + * + * In log space. + */ + DataType m_beta1_factor; + /** Standard deviation of @f$\beta_2@f$ perturbation. + * + * In log space. + */ + DataType m_beta2_factor; + /** Standard deviation of @f$\epsilon@f$ perturbation. + * + * In log space. + */ + DataType m_eps_factor; + + /** Whether to periodically perturb during training. + * + * If false, only perturb once during setup. + */ + bool m_perturb_during_training; + + /** Optimizers for these weights will be perturbed. + * + * If empty, all Adam optimizers in the model will be perturbed. + */ + std::set m_weights_names; + + /** Perturb Adam optimizers in model. */ + void perturb(model& m) const; + /** Perturb Adam optimizer hyperparameters. */ + void perturb(lbann_comm& comm, adam& m) const; + +}; + +// Builder function +std::unique_ptr +build_perturb_adam_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED diff --git a/include/lbann/callbacks/perturb_dropout.hpp b/include/lbann/callbacks/perturb_dropout.hpp new file mode 100644 index 00000000000..c55722ef618 --- /dev/null +++ b/include/lbann/callbacks/perturb_dropout.hpp @@ -0,0 +1,88 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
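A standalone sketch of the log-space perturbation described in the perturb_adam documentation: add Gaussian noise to the log of a hyperparameter and map back. For beta1 and beta2 the noise is applied to log(1 - beta), per the class comment.

#include <cmath>
#include <random>

// Perturb a positive hyperparameter in log space; sigma == 0 leaves it unchanged.
double perturb_log_space(double value, double sigma, std::mt19937& gen) {
  std::normal_distribution<double> noise(0.0, sigma);
  return std::exp(std::log(value) + noise(gen));
}
// e.g. double new_beta1 = 1.0 - perturb_log_space(1.0 - beta1, beta1_factor, gen);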
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_PERTURB_DROPOUT_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_PERTURB_DROPOUT_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include "lbann/layers/regularizers/dropout.hpp" +#include + +namespace lbann { +namespace callback { + +/** @brief Hyperparameter exploration with dropouts. + * + * Goes through the dropout layers in a model and perturbs keep probability + */ +class perturb_dropout : public callback_base { +public: + + /** @param keep_prob_factor Standard deviation of learning rate + * perturbation (in log space). + * @param layer_names Names of layers with dropout keep prob to perturb. If + * empty, all dropout layers in the model are + * perturbed. + */ + perturb_dropout(EvalType keep_prob_factor, + std::set layer_names + = std::set()); + perturb_dropout* copy() const override { return new perturb_dropout(*this); } + std::string name() const override { return "perturb dropout"; } + + void setup(model* m) override; + +private: + + /** Standard deviation of keep probability perturbation. + * + * In log space. + */ + EvalType m_keep_prob_factor; + + /** Keep prob for these layers will be perturbed. + * + * If empty, all dropout layers in the model will be perturbed. + */ + std::set m_layer_names; + + template + dropout* get_dropout_layer(Layer* l); + + /** Perturb dropout keep prob in model. */ + void perturb(model& m); + +}; + +// Builder function +std::unique_ptr +build_perturb_dropout_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_PERTURB_DROPOUT_HPP_INCLUDED diff --git a/include/lbann/callbacks/print_model_description.hpp b/include/lbann/callbacks/print_model_description.hpp new file mode 100644 index 00000000000..9f68cc39322 --- /dev/null +++ b/include/lbann/callbacks/print_model_description.hpp @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_PRINT_MODEL_DESCRIPTION_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_PRINT_MODEL_DESCRIPTION_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** @brief Print human-readable description of model to standard input. 
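A construction sketch for the perturb_dropout callback above (not part of the patch); the element type of the stripped std::set parameter is assumed to be std::string.

#include "lbann/callbacks/perturb_dropout.hpp"

#include <memory>
#include <set>
#include <string>

// Perturb the keep probability of every dropout layer in the model.
auto perturb_dropout_cb = std::make_unique<lbann::callback::perturb_dropout>(
  /*keep_prob_factor=*/0.1, std::set<std::string>{});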
+ * + * Message is printed when the model has finished setup. The + * description includes information on the model's layers, weights, + * and callbacks. + */ +class print_model_description : public callback_base { +public: + print_model_description() : callback_base() {} + print_model_description(const print_model_description&) = default; + print_model_description& operator=(const print_model_description&) = default; + print_model_description* copy() const override { return new print_model_description(*this); } + void on_setup_end(model *m) override; + std::string name() const override { return "print_model_description"; } + +}; + +// Builder function +std::unique_ptr +build_print_model_description_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_PRINT_MODEL_DESCRIPTION_HPP_INCLUDED diff --git a/include/lbann/callbacks/print_statistics.hpp b/include/lbann/callbacks/print_statistics.hpp new file mode 100644 index 00000000000..70fbc42c2ea --- /dev/null +++ b/include/lbann/callbacks/print_statistics.hpp @@ -0,0 +1,71 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// print_statistics .hpp .cpp - Callback hooks to print information +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_PRINT_STATISTICS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_PRINT_STATISTICS_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Periodically print computational results. + * Prints average objective function value and metric scores after + * each training epoch and evaluation. 
+ */ +class print_statistics : public callback_base { + public: + print_statistics(int batch_interval = 1, bool print_global_stat_only=false) : + callback_base(batch_interval), + m_print_global_stat_only(print_global_stat_only) {} + print_statistics(const print_statistics&) = default; + print_statistics& operator=(const print_statistics&) = default; + print_statistics* copy() const override { return new print_statistics(*this); } + void setup(model *m) override; + void on_epoch_begin(model *m) override; + void on_epoch_end(model *m) override; + void on_validation_end(model *m) override; + void on_test_end(model *m) override; + std::string name() const override { return "print_statistics"; } + + private: + /** Print objective function and metrics to standard output. */ + void report_results(model *m); + bool m_print_global_stat_only; + +}; + +// Builder function +std::unique_ptr +build_print_statistics_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_PRINT_STATISTICS_HPP_INCLUDED diff --git a/include/lbann/callbacks/profiler.hpp b/include/lbann/callbacks/profiler.hpp index abedbaaa428..2a1c77a21dd 100644 --- a/include/lbann/callbacks/profiler.hpp +++ b/include/lbann/callbacks/profiler.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_timer .hpp .cpp - Callback hooks to time training +// timer .hpp .cpp - Callback hooks to time training //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_PROFILER_HPP_INCLUDED @@ -32,16 +32,17 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** */ -class lbann_callback_profiler : public lbann_callback { +class profiler : public callback_base { public: - lbann_callback_profiler(bool sync = false, bool skip_init = false); - lbann_callback_profiler(const lbann_callback_profiler&) = default; - lbann_callback_profiler& operator=(const lbann_callback_profiler&) = default; - lbann_callback_profiler* copy() const override { - return new lbann_callback_profiler(*this); + profiler(bool sync = false, bool skip_init = false); + profiler(const profiler&) = default; + profiler& operator=(const profiler&) = default; + profiler* copy() const override { + return new profiler(*this); } void on_epoch_begin(model *m) override; void on_epoch_end(model *m) override; @@ -79,6 +80,12 @@ class lbann_callback_profiler : public lbann_callback { bool m_skip_init; }; -} // namespace lbann +// Builder function +std::unique_ptr +build_profiler_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_PROFILER_HPP_INCLUDED diff --git a/include/lbann/callbacks/replace_weights.hpp b/include/lbann/callbacks/replace_weights.hpp new file mode 100644 index 00000000000..d42ed2573be --- /dev/null +++ b/include/lbann/callbacks/replace_weights.hpp @@ -0,0 +1,81 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
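A construction sketch for the print_statistics callback above (not part of the patch), asking for aggregated statistics only.

#include "lbann/callbacks/print_statistics.hpp"

#include <memory>

// Report objective function values and metrics, aggregated across the trainer.
auto print_stats_cb = std::make_unique<lbann::callback::print_statistics>(
  /*batch_interval=*/1, /*print_global_stat_only=*/true);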
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Weights/parameters replacement on k-batch end + * Currently support replacing weights/parameters using layer names + * Can easily be extended to support replacement by weights name + * Given two layers specified in prototext, weights are copied from source layer to destination layer. + */ +class replace_weights : public callback_base { + public: + replace_weights( + std::vector src, + std::vector dst, + int batch_interval=1) + : callback_base(batch_interval), + m_src_layer_names(std::move(src)), + m_dst_layer_names(std::move(dst)) { + if(m_src_layer_names.size() != m_dst_layer_names.size()) + LBANN_ERROR("In replace weights callback: number of src and dest layers does not match."); + } + + replace_weights( + const replace_weights&) = default; + replace_weights& operator=( + const replace_weights&) = default; + replace_weights* copy() const override { + return new replace_weights(*this); + } + void setup(model *m) override; + void on_batch_end(model *m) override; + + std::string name() const override { return "replace weights"; } + private: + std::vector m_src_layer_names, m_dst_layer_names; + std::vector m_src_layers, m_dst_layers; +}; + +// Builder function +std::unique_ptr +build_replace_weights_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/save_images.hpp b/include/lbann/callbacks/save_images.hpp new file mode 100644 index 00000000000..cf37f33e33d --- /dev/null +++ b/include/lbann/callbacks/save_images.hpp @@ -0,0 +1,83 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
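// --- Illustrative sketch (not part of the diff): the replace_weights callback above
// copies weights from source layers to destination layers every batch_interval steps.
// The element type of the std::vector parameters is assumed to be std::string (template
// arguments were stripped in this diff view); the layer names are hypothetical.
#include "lbann/callbacks/replace_weights.hpp"
#include <memory>
#include <string>
#include <vector>

std::unique_ptr<lbann::callback::replace_weights> make_replace_weights_example() {
  std::vector<std::string> src = {"encoder_fc1"};  // hypothetical source layer
  std::vector<std::string> dst = {"decoder_fc1"};  // hypothetical destination layer
  // src and dst must have the same length; otherwise the constructor calls LBANN_ERROR.
  return std::make_unique<lbann::callback::replace_weights>(src, dst,
                                                            /*batch_interval=*/10);
}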
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED + +#include +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Save layer outputs as image files. + * Image files are in the form + * "-.". + */ +class save_images : public callback_base { +public: + + /** Constructor. + * @param layer_names List of layer names to save as images. + * @param image_format Image file format (e.g. jpg, png, pgm). + * @param image_prefix Prefix for image file names. + */ + save_images(std::vector layer_names, + std::string image_format = "jpg", + std::string image_prefix = ""); + save_images(const save_images&) = default; + save_images& operator=( + const save_images&) = default; + save_images* copy() const override { + return new save_images(*this); + } + void on_epoch_end(model *m) override; + void on_test_end(model *m) override; + std::string name() const override { return "save images"; } + +private: + + /** List of layer names to save as images. */ + std::vector m_layer_names; + /** Image file format. + * Valid options: jpg, png, pgm. + */ + std::string m_image_format; + /** Prefix for saved image files. */ + std::string m_image_prefix; + +}; + +// Builder function +std::unique_ptr +build_save_images_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED diff --git a/include/lbann/callbacks/save_model.hpp b/include/lbann/callbacks/save_model.hpp new file mode 100644 index 00000000000..b5cf2029182 --- /dev/null +++ b/include/lbann/callbacks/save_model.hpp @@ -0,0 +1,103 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
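// --- Illustrative sketch (not part of the diff): constructing the save_images callback
// declared above. The std::vector element type is assumed to be std::string (template
// arguments were stripped in this diff view); the layer name and prefix are hypothetical.
#include "lbann/callbacks/save_images.hpp"
#include <string>
#include <vector>

lbann::callback::save_images make_save_images_example() {
  std::vector<std::string> layers = {"reconstruction"};  // hypothetical layer name
  // One image file is written per listed layer, using the given prefix and format.
  return lbann::callback::save_images(layers,
                                      /*image_format=*/"png",
                                      /*image_prefix=*/"recon-");
}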
+// +// save_model .hpp .cpp - Callbacks to save model, currently as protobuf +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +#include + +// Forward-declare protobuf classes +namespace lbann_data { +class Model; +} + +namespace lbann { +namespace callback { + +/** + * Save model to as protobuf file and set of weights + */ +class save_model : public callback_base { + public: + /** + * @param dir directory to save model + * @param disable_save_after_training Don't save after training + * @param extension file extension e.g., model, state ...... + */ + save_model(std::string dir, + bool disable_save_after_training, + std::string extension="prototext") : + callback_base(), m_dir(std::move(dir)), + m_disable_save_after_training(disable_save_after_training), + m_extension(std::move(extension)) + {} + save_model(const save_model&) = default; + save_model& operator=( + const save_model&) = default; + save_model* copy() const override { + return new save_model(*this); + } + void on_train_end(model *m) override; + std::string name() const override { return "save model"; } + void set_target_dir(const std::string& dir) { m_dir = dir; } + const std::string& get_target_dir() { return m_dir; } + + protected: + friend class lbann::model; + + bool do_save_model(model *m); + bool do_save_model_weights(model *m); + + private: + std::string m_dir; //directory to save file + /// Disables the normal behavior of saving when training is complete + bool m_disable_save_after_training; + std::string m_extension; //file extension + persist p; + + void write_proto_binary(const lbann_data::Model& proto, const std::string filename); + void write_proto_text(const lbann_data::Model& proto, const std::string filename); +}; + +inline std::string get_save_model_dirname(const std::string& trainer_name, const std::string& model_name, const std::string& dir) { + return build_string(dir, '/', trainer_name, '/', model_name, '/'); +} + +// Builder function +std::unique_ptr +build_save_model_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED diff --git a/include/lbann/callbacks/save_topk_models.hpp b/include/lbann/callbacks/save_topk_models.hpp new file mode 100644 index 00000000000..4a5c3800602 --- /dev/null +++ b/include/lbann/callbacks/save_topk_models.hpp @@ -0,0 +1,71 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
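// --- Illustrative sketch (not part of the diff): the save_model callback above writes
// the model as a protobuf file plus weights at the end of training, and the inline
// get_save_model_dirname() helper composes the target directory. Names are hypothetical.
#include "lbann/callbacks/save_model.hpp"
#include <string>

void save_model_example() {
  lbann::callback::save_model cb(/*dir=*/"/p/checkpoints",
                                 /*disable_save_after_training=*/false,
                                 /*extension=*/"prototext");
  // Per the helper's build_string call, this evaluates to "/p/checkpoints/trainer0/lenet/".
  std::string dir =
      lbann::callback::get_save_model_dirname("trainer0", "lenet", "/p/checkpoints");
  cb.set_target_dir(dir);
}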
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// save_topk_models .hpp .cpp - Callback to save top k models +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED + +#include "lbann/callbacks/save_model.hpp" + +namespace lbann { +namespace callback { + +/** Save_topk_models for (e.g., inference and other analysis). + * @param dir directory to save model + * @param k number of models to save, should be less than number of trainers + * @param metric_name, evaluation metric + * @ordering for the topk, descending order is default + * Note: may end up saving more than k models if multiple models (trainers) have the same metric score + */ +class save_topk_models : public save_model { + public: + save_topk_models(std::string dir, int k, std::string metric_name, bool ascending_ordering=false) : + save_model(dir,true), m_k(k),m_metric_name(metric_name),m_ascending_ordering(ascending_ordering) {} + save_topk_models(const save_topk_models&) = default; + save_topk_models& operator=(const save_topk_models&) = default; + save_topk_models* copy() const override { return new save_topk_models(*this); } + void on_test_end(model *m) override; + std::string name() const override { return "save_topk_models"; } + + private: + /*determine if a trainer's model is in top k, computation done by trainer master processes*/ + bool am_in_topk(model *m); + int m_k ; + std::string m_metric_name; + bool m_ascending_ordering; + +}; + +// Builder function +std::unique_ptr +build_save_topk_models_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED diff --git a/include/lbann/callbacks/set_weights_value.hpp b/include/lbann/callbacks/set_weights_value.hpp new file mode 100644 index 00000000000..37b8996b301 --- /dev/null +++ b/include/lbann/callbacks/set_weights_value.hpp @@ -0,0 +1,77 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
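// --- Illustrative sketch (not part of the diff): save_topk_models above extends
// save_model and decides in on_test_end() whether this trainer's model ranks among the
// top k for the named metric. The directory and metric name below are hypothetical.
#include "lbann/callbacks/save_topk_models.hpp"

lbann::callback::save_topk_models make_topk_example() {
  // Keep the 3 best models by test accuracy; higher is better, so keep the default
  // descending ordering (ascending_ordering == false).
  return lbann::callback::save_topk_models("/p/topk_models", /*k=*/3,
                                           /*metric_name=*/"categorical accuracy",
                                           /*ascending_ordering=*/false);
}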
See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SET_WEIGHTS_VALUE_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SET_WEIGHTS_VALUE_HPP_INCLUDED + +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** @brief Set values in a weights object at a given training step + * + * @todo Support weights with arbitrary data types. Currently only + * floats are supported. + */ +class set_weights_value : public callback_base { + public: + /** + * @param weights_name Name of weights object + * @param value Value to set weights + * @param step Mini-batch step at which to set weights value + */ + set_weights_value(std::string weights_name, double value, size_t step); + set_weights_value(const set_weights_value&) = default; + set_weights_value& operator=(const set_weights_value&) = default; + + set_weights_value* copy() const override; + std::string name() const override; + + void on_batch_begin(model *m) override; + + private: + + /** @brief Name of weights object. */ + std::string m_weights_name; + /** @brief Value to set weights. */ + double m_value; + /** @brief Mini-batch step at which to set weights value. */ + size_t m_step; + +}; + +// Builder function +std::unique_ptr +build_set_weights_value_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SET_WEIGHTS_VALUE_HPP_INCLUDED diff --git a/include/lbann/callbacks/summarize_images.hpp b/include/lbann/callbacks/summarize_images.hpp new file mode 100644 index 00000000000..7d396fc45a4 --- /dev/null +++ b/include/lbann/callbacks/summarize_images.hpp @@ -0,0 +1,228 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// summarize_images .hpp .cpp - Callback hooks to dump +// results of image testing to event files +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_SUMMARIZE_IMAGES_HPP_INCLUDED +#define LBANN_CALLBACKS_SUMMARIZE_IMAGES_HPP_INCLUDED + + +#include "lbann/callbacks/callback.hpp" + +#include +#include +#include +#include +namespace lbann { +namespace callback { + +/** @class image_output_strategy + * @brief Interface for strategies for determining which images + * to output to the summarizer. 
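// --- Illustrative sketch (not part of the diff): set_weights_value above overwrites a
// named weights object with a constant at one specific mini-batch step (checked in
// on_batch_begin). The weights name and step number below are hypothetical.
#include "lbann/callbacks/set_weights_value.hpp"

lbann::callback::set_weights_value make_set_weights_example() {
  // At training step 1000, set every entry of "disc_fc1_weights" to 0.0.
  return lbann::callback::set_weights_value("disc_fc1_weights",
                                            /*value=*/0.0,
                                            /*step=*/1000);
}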
+ */ +class image_output_strategy { + +public: + virtual std::vector> + get_image_indices(model const&) const = 0; + virtual std::string get_tag(std::string const& layer_name, + El::Int index, El::Int epoch) const = 0; + virtual ~image_output_strategy() = default; + +}; //class image_output_strategy + + +/** @class CategoricalAccuracy + * @brief Subclass of image_output_strategy to dump categorized + * images to event files based on categorization criteria + */ +class categorical_accuracy_strategy : public image_output_strategy { +public: + + enum class MatchType { + NOMATCH=0, + MATCH=1, + ALL=2 + };// enum class MatchType + + /** @brief summarize_images Constructor. + * @param cat_accuracy_layer_name Name of categorical accuracy layer + * @param match_type Criteria for dumping images (MATCH, NOMATCH, or ALL) + * @param num_images Number of images to summarize per epoch + */ + categorical_accuracy_strategy(std::string const& cat_accuracy_layer_name, + MatchType match_type=MatchType::NOMATCH, + size_t num_images=10) + : m_cat_accuracy_layer_name(cat_accuracy_layer_name), + m_match_type(match_type), + m_num_images(num_images) {} + + /** @brief Get vector containing indices of images to be dumped. + * @returns std::vector Vector with indices of images to dump. + */ + std::vector> + get_image_indices(model const& m) const final; + + /** @brief Construct tag for image */ + std::string get_tag(std::string const& layer_name, + El::Int index, El::Int epoch) const final; + +private: + /** @brief Tests whether image should be dumped based on criteria + * @returns bool Value is true if matches criteria and false otherwise + */ + bool meets_criteria(const DataType& match) const noexcept; + + /** @brief Name of categorical accuracy layer*/ + std::string const m_cat_accuracy_layer_name; + + /** @brief Criterion to dump images */ + MatchType m_match_type; + + /** @brief Number of images to be dumped per epoch */ + size_t m_num_images; + +}; // class categorical_accuracy_strategy : image_output_strategy + +std::unique_ptr +build_categorical_accuracy_strategy_from_pbuf(google::protobuf::Message const&); + +/** @class Autoencoder Subclass of image_output_strategy to dump autoencoder images + * @brief Dump images to event files based on strategy + */ +class autoencoder_strategy : public image_output_strategy { + +public: + + /** @brief autoencoder_strategy : image_output_strategy Constructor. + * @param sample_indices Vector of sample indices for images + */ + autoencoder_strategy(std::string const& input_layer_name, + size_t num_images = 10) + : m_input_layer_name{input_layer_name}, + m_num_images{num_images} {} + + /** @brief Get vector containing indices of images to be dumped. + * @returns std::vector Vector with indices of images to dump. 
+ */ + std::vector> + get_image_indices(model const& m) const final; + + /** @brief Construct tag for image */ + std::string get_tag(std::string const& layer_name, + El::Int index, El::Int epoch) const final; + +private: + + /** @brief Name of input layer */ + std::string m_input_layer_name; + + /** @brief Number of images to be tracked */ + size_t m_num_images; + + /** @brief Sample indices of images to track */ + mutable std::unordered_set m_tracked_images; + + /** @brief A map from models to shuffled indices */ + mutable std::unordered_map> m_shuffled_indices; + +}; // class Autoencoder : image_output_strategy + +std::unique_ptr +build_track_sample_ids_strategy_from_pbuf(google::protobuf::Message const&); + +/** @class summarize_images + * @brief Callback to dump images to event files based on strategy + */ +class summarize_images : public callback_base { + +public: + /** @brief summarize_images Constructor. + * @param summarizer Pointer to lbann_summary object + * @param strategy Pointer to image image_output_strategy + * @param img_source_layer_name Name of image layer + * @param interval Interval of epochs to dump images + * @param img_format Image file format (e.g. .jpg, .png, .pgm) + */ + summarize_images(std::shared_ptr const& summarizer, + std::unique_ptr strategy, + std::string const& img_source_layer_name, + uint64_t interval = 1, + std::string const& img_format = ".jpg"); + + /** @brief Copy constructor */ + callback_base* copy() const override { + LBANN_ERROR( "This callback is not copyable."); + return nullptr; + } + + /** @brief Return name of callback */ + std::string name() const override { return "summarize_images"; } + + /** @brief Hook to pull data from lbann run */ + void on_batch_evaluate_end(model* m) override; + +private: + + /** @brief Add image to event file */ + void dump_images_to_summary(model const& m) const; + + +private: + + /* @brief lbann_summary object */ + std::shared_ptr m_summarizer; + + /* @brief image_output_strategy object */ + std::unique_ptr m_strategy; + + /* @brief Names of layers */ + std::string m_img_source_layer_name; + + /* @brief Interval for dumping images */ + uint64_t m_epoch_interval; + + /** @brief Image file format. Valid options: .jpg, .png, .pgm. */ + std::string m_img_format; + +}; // class summarize_images + +/** @brief Get a layer from model based on name + * @param m The model + * @param layer_name Name of layer + */ +Layer const& get_layer_by_name(model const& m, std::string const& layer_name); + +std::unique_ptr +build_summarize_images_callback_from_pbuf( + const google::protobuf::Message&, + const std::shared_ptr& summarizer); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_SUMMARIZE_IMAGES_HPP_INCLUDED diff --git a/include/lbann/callbacks/summary.hpp b/include/lbann/callbacks/summary.hpp new file mode 100644 index 00000000000..eb199110378 --- /dev/null +++ b/include/lbann/callbacks/summary.hpp @@ -0,0 +1,83 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
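// --- Illustrative sketch (not part of the diff): wiring a categorical_accuracy_strategy
// into the summarize_images callback declared above. The summarizer's pointee type is
// assumed to be lbann_summary (template arguments were stripped in this diff view);
// the layer names are hypothetical.
#include "lbann/callbacks/summarize_images.hpp"
#include <memory>

std::unique_ptr<lbann::callback::summarize_images>
make_summarize_images_example(std::shared_ptr<lbann::lbann_summary> summarizer) {
  using lbann::callback::categorical_accuracy_strategy;
  auto strategy = std::make_unique<categorical_accuracy_strategy>(
      "top1_accuracy",                                    // hypothetical accuracy layer
      categorical_accuracy_strategy::MatchType::NOMATCH,  // dump misclassified images
      /*num_images=*/10);
  return std::make_unique<lbann::callback::summarize_images>(
      summarizer, std::move(strategy),
      /*img_source_layer_name=*/"images",
      /*interval=*/1, /*img_format=*/".jpg");
}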
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// summary .hpp .cpp - Callback hooks to summarize to Tensorboard +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include "lbann/utils/summary.hpp" + +namespace lbann { +namespace callback { + +/** + * Summarize information to Tensorboard using LBANN's summary interface. + */ +class summary : public callback_base { + public: + /** + * @param summarizer The summary object to write to; this callback takes + * ownership of it. + * @param batch_interval The frequency with which to summarize + * @param mat_interval FIXME + * @todo Document mat_interval parameter. + */ + summary(const std::shared_ptr& summarizer, int batch_interval = 1, + int mat_interval = 25); + summary(const summary&) = default; + summary& operator=(const summary&) = default; + summary* copy() const override { + return new summary(*this); + } + void on_train_begin(model *m) override; + void on_batch_end(model *m) override; + void on_epoch_end(model *m) override; + void on_test_end(model *m) override; + std::string name() const override { return "summary"; } + +protected: + /** Write out histograms from the model's layers. */ + void save_histograms(model *m); + +private: + /** @brief lbann_summary */ + std::shared_ptr m_summarizer = nullptr; + + /** Interval for doing matrix summarization. */ + int m_mat_interval; +}; + +// Builder function +std::unique_ptr +build_summary_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED diff --git a/include/lbann/callbacks/sync_layers.hpp b/include/lbann/callbacks/sync_layers.hpp new file mode 100644 index 00000000000..6fa78b5ebb9 --- /dev/null +++ b/include/lbann/callbacks/sync_layers.hpp @@ -0,0 +1,87 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the license. +// +// callback_sync_layers.hpp - Callback to synchronize layers +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** Synchronize layers after forward and backward prop. + * Additionally updates layer timing information to account for this. + * Note that this callback should come before the summarizer callback to report + * time correctly (otherwise it will be shifted by one mini-batch). + */ +class sync_layers : public callback_base { + public: + /** + * @param sync_gpus The GPU stream will be synchronized. + * @param sync_mpi A global barrier will synchronize processes. + * @param only_input The only synchronization will be after the input layer in + * forward prop. + */ + sync_layers(bool sync_gpus = true, bool sync_mpi = true, + bool only_input = false) : + callback_base(1), m_sync_gpus(sync_gpus), m_sync_mpi(sync_mpi), + m_only_input(only_input) {} + sync_layers(const sync_layers&) = default; + sync_layers& operator=( + const sync_layers&) = default; + sync_layers* copy() const override { + return new sync_layers(*this); + } + std::string name() const override { return "sync_layers"; } + + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_end; + + void on_forward_prop_end(model *m, Layer *l) override; + void on_backward_prop_end(model *m, Layer *l) override; + + protected: + /** Whether to synchronize GPUs. */ + bool m_sync_gpus; + /** Whether to do a global synchronization. */ + bool m_sync_mpi; + /** Whether to only synchronize after the input layer. */ + bool m_only_input; + + virtual void do_sync(Layer *l); +}; + +// Builder function +std::unique_ptr +build_sync_layers_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED diff --git a/include/lbann/callbacks/timeline.hpp b/include/lbann/callbacks/timeline.hpp new file mode 100644 index 00000000000..5b247070fab --- /dev/null +++ b/include/lbann/callbacks/timeline.hpp @@ -0,0 +1,99 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
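// --- Illustrative sketch (not part of the diff): the sync_layers callback above inserts
// a GPU-stream and/or MPI synchronization after each layer's forward and backward pass so
// that per-layer timings are attributed correctly. Only the constructor shown is used.
#include "lbann/callbacks/sync_layers.hpp"

lbann::callback::sync_layers make_sync_layers_example() {
  // Synchronize the GPU stream but skip the global MPI barrier, and do so after every
  // layer rather than only the input layer. Register this ahead of any summarizer
  // callback, as the class comment above recommends.
  return lbann::callback::sync_layers(/*sync_gpus=*/true,
                                      /*sync_mpi=*/false,
                                      /*only_input=*/false);
}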
+// +// callback_timeline .hpp .cpp - Callback hooks to record a timeline of runtime +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED + +#include +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Record a timeline of training runtime on each rank and output it to a + * logfile for external processing. + * The logfile is named timeline.m\.\.txt. + * Each line is a separate event, written as name:start-time:end-time. + * Times are relative to the beginning of training. + */ +class timeline : public callback_base { + public: + timeline(std::string outdir) : callback_base(1), + m_outdir(outdir) {} + timeline(const timeline&) = default; + timeline& operator=(const timeline&) = default; + timeline* copy() const override { + return new timeline(*this); + } + std::string name() const override { return "timeline"; } + void on_train_begin(model *m) override; + void on_train_end(model *m) override; + + using callback_base::on_forward_prop_begin; + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_begin; + using callback_base::on_backward_prop_end; + using callback_base::on_optimize_begin; + using callback_base::on_optimize_end; + + void on_forward_prop_begin(model *m, Layer *l) override; + void on_forward_prop_end(model *m, Layer *l) override; + void on_backward_prop_begin(model *m, Layer *l) override; + void on_backward_prop_end(model *m, Layer *l) override; + void on_optimize_begin(model *m, weights *w) override; + void on_optimize_end(model *m, weights *w) override; + private: + /// Get time relative to the start time. + EvalType get_rel_time() const { return get_time() - m_start_time; } + + /// Directory to write output to. + std::string m_outdir; + /// Time training started; all times are relative to this. + EvalType m_start_time = EvalType(0); + /// Time the current layer's forward pass started. + EvalType m_fp_start_time = EvalType(0); + /// Time the current layer's backward pass started. + EvalType m_bp_start_time = EvalType(0); + /// Time the current weights' optimization pass started. + EvalType m_opt_start_time = EvalType(0); + /// Store (relative) timing information. + std::unordered_map>> m_fp_times; + std::unordered_map>> m_bp_times; + std::unordered_map>> m_opt_times; +}; + +// Builder function +std::unique_ptr +build_timeline_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED diff --git a/include/lbann/callbacks/timer.hpp b/include/lbann/callbacks/timer.hpp new file mode 100644 index 00000000000..2afcf03d23f --- /dev/null +++ b/include/lbann/callbacks/timer.hpp @@ -0,0 +1,113 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include +#include +#include + +namespace lbann { +namespace callback { + +/** Record and report model timing results. + * Reports the total time and mini-batch time statistics for training + * epochs and for model evaluations. This reports times for the + * master process in each model. + */ +class timer : public callback_base { +public: + + timer(const std::shared_ptr& summarizer = nullptr) + : callback_base(1) {} + timer(const timer&) = default; + timer& operator=(const timer&) = default; + timer* copy() const override { + return new timer(*this); + } + + /** Start timing for a training epoch. */ + void on_epoch_begin(model *m) override { timing_begin(*m); } + /** Report timing for a training epoch. */ + void on_epoch_end(model *m) override { timing_end(*m); } + /** Start timing for validation. */ + void on_validation_begin(model *m) override { timing_begin(*m); } + /** Report timing for validation. */ + void on_validation_end(model *m) override { timing_end(*m); } + /** Start timing for testing. */ + void on_test_begin(model *m) override { timing_begin(*m); } + /** Report timing for testing. */ + void on_test_end(model *m) override { timing_end(*m); } + /** Record training mini-batch start time. */ + void on_batch_begin(model *m) override { batch_timing_begin(*m); } + /** Record training mini-batch run time. */ + void on_batch_end(model *m) override { batch_timing_end(*m); } + /** Record evaluation mini-batch start time. */ + void on_batch_evaluate_begin(model *m) override { batch_timing_begin(*m); } + /** Record evaluation mini-batch run time. */ + void on_batch_evaluate_end(model *m) override { batch_timing_end(*m); } + + /** Callback name. */ + std::string name() const override { return "timer"; } + +private: + + /** Timing session start times. */ + std::map m_start_times; + /** Mini-batch timing session start times. */ + std::map m_batch_start_times; + /** Mini-batch times. */ + std::map> m_batch_times; + + /** Start timing session. */ + void timing_begin(const model& m); + /** End timing session. + * Prints results to standard output. + */ + void timing_end(model& m); + /** Start mini-batch timing session. */ + void batch_timing_begin(const model& m); + /** End mini-batch timing session. + * Prints results to standard output. 
+ */ + void batch_timing_end(const model& m); + + /** @brief lbann_summary */ + std::shared_ptr m_summarizer = nullptr; + +}; + +// Builder function +std::unique_ptr +build_timer_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED diff --git a/include/lbann/callbacks/variable_minibatch.hpp b/include/lbann/callbacks/variable_minibatch.hpp new file mode 100644 index 00000000000..5bc5c37318b --- /dev/null +++ b/include/lbann/callbacks/variable_minibatch.hpp @@ -0,0 +1,160 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// lbann_variable_minibatch .hpp .cpp - Callback for variable-size mini-batches +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED +#define LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** + * Support changing the mini-batch size on different schedules. + * Implementations should override implement the abstract methods to define + * concrete schedules. + */ +class variable_minibatch : public callback_base { + public: + variable_minibatch(size_t starting_mbsize); + variable_minibatch( + const variable_minibatch&) = default; + variable_minibatch& operator=( + const variable_minibatch&) = default; + /// Set the initial mini-batch size. + void on_train_begin(model *m) override; + /// Potentially change the mini-batch size. + void on_epoch_end(model *m) override; + protected: + /** + * Implemented by child classes to provide the mini-batch/learning schedule. + * This is called at the end of every training epoch. If it returns false, + * no changes are made from the currently established schedule. + * If this returns true, the mini-batch size will be changed accordingly. + * If the mini-batch size is larger than the model's maximum mini-batch size, + * a warning is printed and the maximum mini-batch size is used. + * If new_lr also non-zero, the learning rate will be changed to new_lr, + * with a linear ramp time. (If ramp_time is 0, it is changed immediately.) + * Note changing the learning rate while in a ramp may lead to unexpected + * behavior; also be aware of interactions with other learning rate + * schedules. 
+ */ + virtual bool schedule(model *m, size_t& new_mbsize, float& new_lr, + size_t& ramp_time) = 0; + /// Change the learning rate of every layer in m to new_lr. + void change_learning_rate(model *m, float new_lr) const; + /// Get the current learning rate (assumes every layer has the same one). + float get_current_learning_rate(model *m) const; + + /// Initial mini-batch size. + size_t m_starting_mbsize; + /** + * The current mini-batch size for this epoch. + * This is kept separately from the model's get_current_mini_batch_size() + * method, as calling that in on_epoch_end returns the size of the last mini- + * batch, not the "base" mini-batch. + */ + size_t m_current_mini_batch_size; + /// Current number of epochs left to ramp the learning rate. + size_t m_ramp_count = 0; + /// Amount to increment the learning rate by when ramping. + float m_lr_incr = 0.0f; +}; + +/** + * Double the mini-batch size every set number of epochs. + * Also doubles the learning rate. + */ +class step_minibatch : public variable_minibatch { + public: + step_minibatch(size_t starting_mbsize, size_t step, + size_t ramp_time = 0); + step_minibatch(const step_minibatch&) = default; + step_minibatch& operator=( + const step_minibatch&) = delete; + step_minibatch* copy() const override { + return new step_minibatch(*this); + } + std::string name() const override { return "step minibatch"; } + protected: + bool schedule(model *m, size_t& new_mbsize, float& new_lr, size_t& ramp_time) override; + + private: + /// Number of epochs between mini-batch size increases. + size_t m_step; + /// Number of steps to ramp the learning rate over. + size_t m_ramp_time; +}; + +// Builder function +std::unique_ptr +build_step_minibatch_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +class minibatch_schedule : public variable_minibatch { + public: + /// Represents a step in a schedule of mini-batch sizes. + struct minibatch_step { + /// Epoch for this schedule to start. + size_t epoch; + /// Mini-batch size to use. + size_t mbsize; + /// Learning rate to use. + float lr; + /// Number of epochs to ramp the learning rate over. + size_t ramp_time; + minibatch_step(size_t _epoch, size_t _mbsize, float _lr, size_t _ramp_time) : + epoch(_epoch), mbsize(_mbsize), lr(_lr), ramp_time(_ramp_time) {} + }; + + minibatch_schedule( + size_t starting_mbsize, std::vector steps); + minibatch_schedule( + const minibatch_schedule&) = default; + minibatch_schedule& operator=( + const minibatch_schedule&) = delete; + minibatch_schedule* copy() const override { + return new minibatch_schedule(*this); + } + std::string name() const override { return "minibatch schedule"; } + protected: + bool schedule(model *m, size_t& new_mbsize, float& new_lr, size_t& ramp_time) override; + private: + /// Steps in the mini-batch schedule, stored in reverse sorted order. + std::vector m_steps; +}; + +// Builder function +std::unique_ptr +build_minibatch_schedule_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED diff --git a/include/lbann/comm.hpp b/include/lbann/comm.hpp index 2ab72fe1273..1af406feea9 100644 --- a/include/lbann/comm.hpp +++ b/include/lbann/comm.hpp @@ -41,6 +41,11 @@ namespace lbann { +#ifdef LBANN_HAS_ALUMINUM +/** Convert an MPI_Op to an Aluminum reduction operator. */ +::Al::ReductionOperator mpi_op_to_al_op(El::mpi::Op op); +#endif + namespace Al { /** Dummy Aluminum backend. 
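// --- Illustrative sketch (not part of the diff): building the variable mini-batch
// callbacks declared above. The element type of the schedule vector is assumed to be
// minibatch_schedule::minibatch_step (template arguments were stripped in this diff
// view); the epochs, sizes, and learning rates below are hypothetical.
#include "lbann/callbacks/variable_minibatch.hpp"
#include <vector>

void variable_minibatch_example() {
  using lbann::callback::minibatch_schedule;
  using lbann::callback::step_minibatch;

  // Double the mini-batch size every 10 epochs, ramping the learning rate over 2 epochs.
  step_minibatch step_cb(/*starting_mbsize=*/64, /*step=*/10, /*ramp_time=*/2);

  // Or supply an explicit schedule: each entry is (epoch, mbsize, lr, ramp_time).
  std::vector<minibatch_schedule::minibatch_step> steps = {
    {10, 128, 0.02f, 1},
    {20, 256, 0.04f, 1},
  };
  minibatch_schedule sched_cb(/*starting_mbsize=*/64, steps);
  (void) step_cb; (void) sched_cb;
}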
*/ @@ -165,6 +170,14 @@ class lbann_comm { inline int get_world_rank(int trainer, int rank) const { return procs_per_trainer * trainer + rank; } + /** Return the "rank" of the trainer that this rank is in */ + inline int map_world_rank_to_trainer_rank(int world_rank) const { + return (world_rank / procs_per_trainer); + } + /** Return the "rank" within the trainer that this rank is in */ + inline int map_world_rank_to_rank_in_trainer(int world_rank) const { + return (world_rank % procs_per_trainer); + } /** Return the rank of the master process in this trainer. */ inline int get_trainer_master() const { return 0; @@ -412,6 +425,14 @@ class lbann_comm { El::mpi::AllGather(&src, 1, data.data(), 1, c, El::SyncInfo{}); } + /** + * Allgather for a single element over the world communicator; + * std::vector &data must be correctly sized prior to entry. + */ + template + void world_all_gather(T &src, std::vector &data) { + all_gather(src, data, get_world_comm()); + } /** * Allgather for a single element over the trainer communicator; * std::vector &data must be correctly sized prior to entry. @@ -702,18 +723,21 @@ class lbann_comm { bytes_received += count * sizeof(T) * (size_c - 1); } /** Matrix allreduce. */ - void allreduce(AbsMat& m, + template + void allreduce(El::AbstractMatrix& m, const El::mpi::Comm& c, El::mpi::Op op = El::mpi::SUM); /** Matrix allreduce. */ - void allreduce(AbsDistMat& m, + template + void allreduce(El::AbstractDistMatrix& m, const El::mpi::Comm& c, El::mpi::Op op = El::mpi::SUM); /** Non-blocking matrix allreduce. * If LBANN has not been built with Aluminum, then this calls a * blocking matrix allreduce. */ - void nb_allreduce(AbsMat& m, + template + void nb_allreduce(El::AbstractMatrix& m, const El::mpi::Comm& c, Al::request& req, El::mpi::Op op = El::mpi::SUM); @@ -721,7 +745,8 @@ class lbann_comm { * If LBANN has not been built with Aluminum, then this calls a * blocking matrix allreduce. */ - void nb_allreduce(AbsDistMat& m, + template + void nb_allreduce(El::AbstractDistMatrix& m, const El::mpi::Comm& c, Al::request& req, El::mpi::Op op = El::mpi::SUM); @@ -998,6 +1023,16 @@ class lbann_comm { return node_comm; } + /** + * Return a communicator containing num_per_group processors. + * + * This will attempt to pack processes so that the processes in each group + * are physically close together on the system. + * + * num_per_group must evenly divide the number of processors in the world. + */ + const El::mpi::Comm& get_packed_group_comm(int num_per_group) const; + /** Return true if rank (in comm) is on the local node. */ bool is_rank_node_local(int rank, const El::mpi::Comm& comm) const { // Translating to COMM_WORLD is typically constant time. @@ -1017,6 +1052,8 @@ class lbann_comm { El::mpi::Comm intertrainer_comm; /** Communicator for every process in the same compute node. */ El::mpi::Comm node_comm; + /** Packed group communicators. */ + mutable std::unordered_map group_communicators; /** Grid for this trainer. */ Grid *grid; /** Number of trainers. */ @@ -1040,11 +1077,6 @@ class lbann_comm { */ int threads_per_proc; -#ifdef LBANN_HAS_ALUMINUM - /** Convert an MPI_Op to an Aluminum reduction operator. */ - ::Al::ReductionOperator mpi_op_to_al_op(El::mpi::Op op); -#endif - // Various statistics counters. 
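// --- Illustrative sketch (not part of the diff): the world-rank mapping helpers added to
// lbann_comm above are plain integer arithmetic. A standalone mirror of the same
// formulas, assuming 4 processes per trainer:
#include <cassert>

int map_world_rank_to_trainer_rank(int world_rank, int procs_per_trainer) {
  return world_rank / procs_per_trainer;   // which trainer owns this rank
}
int map_world_rank_to_rank_in_trainer(int world_rank, int procs_per_trainer) {
  return world_rank % procs_per_trainer;   // position inside that trainer
}

void rank_mapping_example() {
  // World rank 10 with 4 processes per trainer lives in trainer 2, local rank 2.
  assert(map_world_rank_to_trainer_rank(10, 4) == 2);
  assert(map_world_rank_to_rank_in_trainer(10, 4) == 2);
}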
size_t num_trainer_barriers; size_t num_intertrainer_barriers; @@ -1112,6 +1144,25 @@ void lbann_comm::broadcast(const int root, std::string& str, const */ int get_rank_in_world(); +#ifndef LBANN_COMM_INSTANTIATE +#define PROTO(T) \ + extern template void lbann_comm::allreduce( \ + El::AbstractMatrix& m, const El::mpi::Comm& c, El::mpi::Op op); \ + extern template void lbann_comm::allreduce( \ + El::AbstractDistMatrix& m, const El::mpi::Comm& c, El::mpi::Op op); \ + extern template void lbann_comm::nb_allreduce( \ + El::AbstractMatrix& m, const El::mpi::Comm& c, Al::request& req, El::mpi::Op op); \ + extern template void lbann_comm::nb_allreduce( \ + El::AbstractDistMatrix& m, const El::mpi::Comm& c, Al::request& req, El::mpi::Op op) + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF +#endif // LBANN_COMM_INSTANTIATE + } // namespace lbann #endif // LBANN_COMM_HPP_INCLUDED diff --git a/include/lbann/data_coordinator/CMakeLists.txt b/include/lbann/data_coordinator/CMakeLists.txt new file mode 100644 index 00000000000..d41974adb29 --- /dev/null +++ b/include/lbann/data_coordinator/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + data_coordinator.hpp + data_coordinator_metadata.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/data_coordinator/data_coordinator.hpp b/include/lbann/data_coordinator/data_coordinator.hpp new file mode 100644 index 00000000000..e4a9ae06c01 --- /dev/null +++ b/include/lbann/data_coordinator/data_coordinator.hpp @@ -0,0 +1,403 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_DATA_COORDINATOR_HPP +#define LBANN_DATA_COORDINATOR_HPP + +#include "lbann/data_coordinator/data_coordinator_metadata.hpp" +#include "lbann/utils/dataset.hpp" +#include "lbann/execution_contexts/execution_context.hpp" +#include +#include +#include +#include +#include + + +namespace lbann { + +// Forward-declare trainer +class trainer; + +class data_coordinator { + public: + using data_reader_map_t = std::map; + using io_buffer_map_t = std::map>; + + public: + data_coordinator(trainer& trainer, lbann_comm *comm) : + m_trainer(&trainer), + m_comm(comm), + m_data_set_processed(false), + m_execution_context(nullptr) {} + + ~data_coordinator() { + // Data coordinator always frees data readers. + for (auto& dr : m_data_readers) { + delete dr.second; + } + } + + // Data Coordinators copy their data readers. + data_coordinator(const data_coordinator& other) + : m_comm(other.m_comm), + m_training_dataset(other.m_training_dataset), + m_testing_dataset(other.m_testing_dataset), + m_validation_dataset(other.m_validation_dataset), + m_data_readers(other.m_data_readers), + m_execution_context(other.m_execution_context) { + for (auto& dr : m_data_readers) { + dr.second = dr.second ? dr.second->copy() : nullptr; + } + } + + data_coordinator& operator=(const data_coordinator& other) { + for (auto& dr : m_data_readers) { + dr.second = dr.second ? dr.second->copy() : nullptr; + } + return *this; + } + + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(/*CEREAL_NVP(m_io_buffer),*/ + CEREAL_NVP(m_training_dataset), + CEREAL_NVP(m_testing_dataset), + CEREAL_NVP(m_validation_dataset)/*, + CEREAL_NVP(m_data_readers), + CEREAL_NVP(m_data_set_processed)*/); + } + + void setup(int max_mini_batch_size, std::map data_readers); + + /** Check to see if there is a valid training context for the data coordinator */ + bool has_valid_execution_context() const { + return (m_execution_context != nullptr); + } + + /** Grab the training context of the data coordinator */ + const execution_context& get_execution_context() const { + if(m_execution_context == nullptr) { + LBANN_ERROR("execution context is not set"); + } + return *m_execution_context; + } + + /** Grab the training context of the data coordinator */ + execution_context& get_execution_context() { + return const_cast(static_cast(*this).get_execution_context()); + } + + //************************************************************************ + // Helper functions to access the data readers + //************************************************************************ + + generic_data_reader *get_data_reader(const execution_mode mode) const { + generic_data_reader *data_reader = nullptr; + + auto it = m_data_readers.find(mode); + if (it != m_data_readers.end()) data_reader = it->second; + + switch(mode) { + case execution_mode::training: + break; + case execution_mode::validation: + break; + case execution_mode::testing: + break; + default: + LBANN_ERROR("generic data distribution: invalid execution phase"); + } + return data_reader; + } + + /** + * Get the dimensions of the underlying data. 
+ */ + TargetModeDimMap get_data_dims() { + TargetModeDimMap map; + generic_data_reader *dr; + for(execution_mode mode : execution_mode_iterator()) { + dr = get_data_reader(mode); + if (dr != nullptr) { + map[data_reader_target_mode::INPUT] = dr->get_data_dims(); + map[data_reader_target_mode::CLASSIFICATION] = std::vector(1, dr->get_num_labels()); + map[data_reader_target_mode::REGRESSION] = std::vector(1, dr->get_num_responses()); + map[data_reader_target_mode::RECONSTRUCTION] = dr->get_data_dims(); + map[data_reader_target_mode::NA] = std::vector(1, 0); + return map; + } + } + LBANN_ERROR("get_data_dims: no available data readers"); + return {}; + } + + /** + * Get the dimensions of the underlying data. + */ + SPModeSlicePoints get_slice_points() { + SPModeSlicePoints map; + generic_data_reader *dr; + for(execution_mode mode : execution_mode_iterator()) { + dr = get_data_reader(mode); + if (dr != nullptr) { + for(slice_points_mode sp_mode : slice_points_mode_iterator()) { + bool is_supported; + std::vector tmp = dr->get_slice_points(sp_mode, is_supported); + if(is_supported) { + map[sp_mode] = tmp; + } + } + return map; + } + } + LBANN_ERROR("get_data_dims: no available data readers"); + return {}; + } + + DataReaderMetaData get_dr_metadata() { + DataReaderMetaData drm; + drm.data_dims = get_data_dims(); + drm.slice_points = get_slice_points(); + return drm; + } + + // At the start of the epoch, set the execution mode and make sure + // that each layer points to this model + void reset_mode(execution_context& context) { + m_execution_context = static_cast>(&context); + } + + //************************************************************************ + // Helper functions to access the dataset statistics + //************************************************************************ + dataset& get_dataset(execution_mode m) { + switch(m) { + case execution_mode::training: + return m_training_dataset; + break; + case execution_mode::validation: + return m_validation_dataset; + break; + case execution_mode::testing: + return m_testing_dataset; + break; + default: + LBANN_ERROR("get_dataset: invalid execution mode"); + } + } + + const dataset& get_dataset(execution_mode m) const { + switch(m) { + case execution_mode::training: + return m_training_dataset; + break; + case execution_mode::validation: + return m_validation_dataset; + break; + case execution_mode::testing: + return m_testing_dataset; + break; + default: + LBANN_ERROR("get_dataset: invalid execution mode"); + } + } + + /** + * Return the first dataset with a valid (non-null) datareader. + * Returns null if none are valid. 
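// --- Illustrative sketch (not part of the diff): consuming the metadata assembled by
// get_dr_metadata() above. The mapped type of TargetModeDimMap is assumed to be a
// std::vector of dimensions (template arguments were stripped in this diff view).
#include "lbann/data_coordinator/data_coordinator_metadata.hpp"
#include <iostream>

void print_classification_dims(lbann::DataReaderMetaData const& drm) {
  // For CLASSIFICATION the data coordinator stores a single dimension: the label count.
  auto const& dims = drm.data_dims.at(lbann::data_reader_target_mode::CLASSIFICATION);
  std::cout << "number of labels: " << dims.front() << std::endl;
}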
+ */ + dataset* select_first_valid_dataset() { + if (m_data_readers[execution_mode::training]) { + return &m_training_dataset; + } else if (m_data_readers[execution_mode::validation]) { + return &m_validation_dataset; + } else if (m_data_readers[execution_mode::testing]) { + return &m_testing_dataset; + } else { + return nullptr; + } + } + + long get_num_samples_trained() const { + return m_training_dataset.get_num_samples_processed(); + } + long get_num_samples_tested() const { + return m_testing_dataset.get_num_samples_processed(); + } + long get_total_num_training_samples() const { + return m_training_dataset.get_total_samples(); + } + long get_total_num_testing_samples() const { + return m_testing_dataset.get_total_samples(); + } + + //************************************************************************ + // + //************************************************************************ + + void calculate_num_iterations_per_epoch(int max_mini_batch_size, generic_data_reader *data_reader); + void calculate_num_iterations_per_epoch(int mini_batch_size); + + int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers) const; + static int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers, const lbann_comm* comm); + + //************************************************************************ + // + //************************************************************************ + + // save state of IO to a checkpoint + bool save_to_checkpoint_shared(persist& p) const { + // save state of data readers from input layer + data_reader_map_t::const_iterator it; + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint){ + + it = this->m_data_readers.find(execution_mode::training); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_shared(p, execution_mode::training); + } + it = this->m_data_readers.find(execution_mode::testing); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_shared(p, execution_mode::testing); + } + it = this->m_data_readers.find(execution_mode::validation); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_shared(p, execution_mode::validation); + } + + if (this->m_comm->am_trainer_master()) { + write_cereal_archive(*this, p, execution_mode::training, "_dc.xml"); + } + } + return true; + } + + // reload state of IO from a checkpoint + bool load_from_checkpoint_shared(persist& p) { + // save state of data readers from input layer + data_reader_map_t::const_iterator it; + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint){ + + it = this->m_data_readers.find(execution_mode::training); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_shared(p, execution_mode::training); + } + it = this->m_data_readers.find(execution_mode::testing); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_shared(p, execution_mode::testing); + } + it = this->m_data_readers.find(execution_mode::validation); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_shared(p, execution_mode::validation); + } + + std::string buf; + if (this->m_comm->am_trainer_master()) { + read_cereal_archive(*this, p, execution_mode::training, 
"_dc.xml"); + buf = create_cereal_archive_binary_string(*this); + } + + // TODO: this assumes homogeneous processors + // broadcast state from rank 0 + this->m_comm->trainer_broadcast(0, buf); + + if (!this->m_comm->am_trainer_master()) { + unpack_cereal_archive_binary_string(*this, buf); + } + } + + return true; + } + + bool save_to_checkpoint_distributed(persist& p) const { + // save state of data readers from input layer + data_reader_map_t::const_iterator it; + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint) { + + it = this->m_data_readers.find(execution_mode::training); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_distributed(p, execution_mode::training); + } + it = this->m_data_readers.find(execution_mode::testing); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_distributed(p, execution_mode::testing); + } + it = this->m_data_readers.find(execution_mode::validation); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->save_to_checkpoint_distributed(p, execution_mode::validation); + } + + write_cereal_archive(*this, p, execution_mode::training, "_dc.xml"); + } + return true; + } + + bool load_from_checkpoint_distributed(persist& p) { + // save state of data readers from input layer + data_reader_map_t::const_iterator it; + it = this->m_data_readers.find(execution_mode::training); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_distributed(p, execution_mode::training); + } + it = this->m_data_readers.find(execution_mode::testing); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_distributed(p, execution_mode::testing); + } + it = this->m_data_readers.find(execution_mode::validation); + if ((it != this->m_data_readers.end()) && it->second) { + (it->second)->load_from_checkpoint_distributed(p, execution_mode::validation); + } + + read_cereal_archive(*this, p, execution_mode::training, "_dc.xml"); + return true; + } + + protected: + /** Pointer to hosting trainer */ + trainer *m_trainer; + /** Pointer to LBANN communicator. */ + lbann_comm *m_comm; + + dataset m_training_dataset; + dataset m_testing_dataset; + dataset m_validation_dataset; + + data_reader_map_t m_data_readers; + // std::map m_dataset_stats; +public: // @todo BVE FIXME + bool m_data_set_processed; + std::mutex dr_mutex; + + /** Pointer to the execution context object used for training or evaluating this model */ + observer_ptr m_execution_context; +}; + +} // namespace lbann + +#endif // LBANN_DATA_COORDINATOR_HPP diff --git a/include/lbann/data_coordinator/data_coordinator_metadata.hpp b/include/lbann/data_coordinator/data_coordinator_metadata.hpp new file mode 100644 index 00000000000..d9c37f23527 --- /dev/null +++ b/include/lbann/data_coordinator/data_coordinator_metadata.hpp @@ -0,0 +1,64 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_DATA_COORDINATOR_METADATA_HPP +#define LBANN_DATA_COORDINATOR_METADATA_HPP + +#include + +#include "lbann/utils/enum_iterator.hpp" + +#include +#include +#include + +namespace lbann { + +// NA - Not applicable, used for input layers that don't produce a second output +enum class data_reader_target_mode {CLASSIFICATION, REGRESSION, RECONSTRUCTION, INPUT, NA}; +std::string to_string(data_reader_target_mode m); +/// Map from target modes to dimension maps +using TargetModeDimMap = std::unordered_map>; +using data_reader_target_mode_iterator = enum_iterator; + +enum class slice_points_mode {INDEPENDENT, DEPENDENT, NA}; +std::string to_string(const slice_points_mode m); +slice_points_mode slice_points_mode_from_string(const std::string& m); +/// Map from slice points modes to slice points +using SPModeSlicePoints = std::unordered_map>; +using slice_points_mode_iterator = enum_iterator; + +/// Data structure containing metadata from the data readers +//using DataReaderMetaData = std::pair; + +struct DataReaderMetaData { + TargetModeDimMap data_dims; + SPModeSlicePoints slice_points; +}; + +} // namespace lbann + +#endif // LBANN_DATA_COORDINATOR_METADATA_HPP diff --git a/include/lbann/data_readers/CMakeLists.txt b/include/lbann/data_readers/CMakeLists.txt index f6d513de63a..7d56b6ebf46 100644 --- a/include/lbann/data_readers/CMakeLists.txt +++ b/include/lbann/data_readers/CMakeLists.txt @@ -1,42 +1,23 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS compound_data_reader.hpp - cv_augmenter.hpp - cv_colorizer.hpp - cv_decolorizer.hpp - cv_cropper.hpp - cv_mean_extractor.hpp - cv_normalizer.hpp - cv_process.hpp - cv_process_patches.hpp - cv_transform.hpp - cv_utils.hpp data_reader.hpp - data_reader_ascii.hpp data_reader_cifar10.hpp data_reader_csv.hpp data_reader_image.hpp data_reader_imagenet.hpp - data_reader_imagenet_patches.hpp data_reader_merge_features.hpp data_reader_merge_samples.hpp data_reader_mnist.hpp - data_reader_moving_mnist.hpp data_reader_nci.hpp data_reader_numpy.hpp data_reader_numpy_npz.hpp + data_reader_numpy_npz_conduit.hpp data_reader_pilot2_molecular.hpp data_reader_python.hpp data_reader_synthetic.hpp - image_preprocessor.hpp - image_utils.hpp - opencv.hpp - opencv_extensions.hpp - data_reader_multihead_siamese.hpp + data_reader_smiles.hpp ) -# Add the subdirectories -add_subdirectory(patchworks) - # Propagate the files up the tree set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/data_readers/cv_augmenter.hpp b/include/lbann/data_readers/cv_augmenter.hpp deleted file mode 100644 index ba584ab18fe..00000000000 --- a/include/lbann/data_readers/cv_augmenter.hpp +++ /dev/null @@ -1,114 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. 
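The new data_coordinator_metadata.hpp header above centralizes the target-mode and slice-point bookkeeping that get_dr_metadata() packages up. A short, hypothetical consumer is sketched below; it assumes data_reader_target_mode_iterator can be default-constructed and ranged over, mirroring how get_slice_points() uses slice_points_mode_iterator(), and again assumes std::vector<int> as the dimension type.

// Sketch only: iterate every target mode recorded in a DataReaderMetaData.
// Assumes the enum_iterator alias supports range-for, as its use in
// get_slice_points() suggests, and that dimensions are std::vector<int>.
#include <iostream>

void dump_metadata(const lbann::DataReaderMetaData& drm) {
  for (lbann::data_reader_target_mode mode :
       lbann::data_reader_target_mode_iterator()) {
    auto it = drm.data_dims.find(mode);
    if (it == drm.data_dims.end()) { continue; }
    std::cout << lbann::to_string(mode) << ":";
    for (int d : it->second) { std::cout << ' ' << d; }
    std::cout << '\n';
  }
}

The slice_points map can be walked the same way with slice_points_mode_iterator().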
-// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_augmenter .cpp .hpp - Augmenting functions for images in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_AUGMENTER_HPP -#define LBANN_CV_AUGMENTER_HPP - -#include "cv_transform.hpp" -#include -#include -#include -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Supports the following transforms: - * - Random horizontal and vertical flips - * - Random rotations - * - Random horizontal and vertical shifts - * - Random shearing - */ -class cv_augmenter : public cv_transform { - protected: - // --- configuration variables --- - /** Whether to do horizontal flips. */ - bool m_do_horizontal_flip; - /** Whether to do vertical flips. */ - bool m_do_vertical_flip; - - /** Range in degrees for rotations (0-180). */ - float m_rotation_range; - /** Range (fraction of total width) for horizontal shifts. */ - float m_horizontal_shift_range; - /** Range (fraction of total height) for vertical shifts. */ - float m_vertical_shift_range; - /** Shear angle (radians). */ - float m_shear_range; - - // --- state variables --- - /// Flip decision made - cv_flipping m_flip; // currently more of a configuration variable but can easily become a state variable - /// The rest of the affine tranformations determined - cv::Mat_ m_trans; - - /// Check if there is a reason to enable. (i.e., any option set) - bool check_to_enable() const override; - - public: - cv_augmenter(); - cv_augmenter(const cv_augmenter& rhs); - cv_augmenter& operator=(const cv_augmenter& rhs); - cv_augmenter* clone() const override; - - ~cv_augmenter() override {} - - /// Set the parameters all at once - void set(const bool hflip, const bool vflip, const float rot, - const float hshift, const float vshift, const float shear); - - /// Clear the states of the previous transform applied - void reset() override; - - /** - * Construct an affine transformation matrix based on the options and random - * numbers. If successful, the tranform is enabled. If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// Augmentation is irreversible. Thus, this has no effect. - bool determine_inverse_transform() override { return false; } - - /** - * Apply the transformation determined. - * As this method is executed, the transform becomes deactivated. - * @return false if not successful. 
- */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "augmenter"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_AUGMENTER_HPP diff --git a/include/lbann/data_readers/cv_colorizer.hpp b/include/lbann/data_readers/cv_colorizer.hpp deleted file mode 100644 index 7d667f9cca5..00000000000 --- a/include/lbann/data_readers/cv_colorizer.hpp +++ /dev/null @@ -1,81 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_colorizer .cpp .hpp - transform a non-color (grayscale) image into a -// 3-channel color image -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_COLORIZER_HPP -#define LBANN_CV_COLORIZER_HPP - -#include "cv_transform.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -class cv_colorizer : public cv_transform { - protected: - // --- state variables --- - bool m_gray; ///< whether an image is monochrome or not - - public: - cv_colorizer() : cv_transform(), m_gray(false) {} - cv_colorizer(const cv_colorizer& rhs); - cv_colorizer& operator=(const cv_colorizer& rhs); - cv_colorizer *clone() const override; - - ~cv_colorizer() override {} - - void set() { reset(); } - void reset() override { - m_enabled = false; - m_gray = false; - } - - /** - * If a given image is in grayscale, the tranform is enabled, and not otherwise. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// convert back to color image if it used to be a grayscale image - bool determine_inverse_transform() override; - - /** - * Apply color conversion if enabled. - * As it is applied, the transform becomes deactivated. - * @return false if not successful. 
- */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "colorizer"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_COLORIZER_HPP diff --git a/include/lbann/data_readers/cv_cropper.hpp b/include/lbann/data_readers/cv_cropper.hpp deleted file mode 100644 index 651e7945d5b..00000000000 --- a/include/lbann/data_readers/cv_cropper.hpp +++ /dev/null @@ -1,121 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_cropper .cpp .hpp - Functions to crop images -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_CROPPER_HPP -#define LBANN_CV_CROPPER_HPP - -#include "lbann/data_readers/cv_transform.hpp" -#include -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * If the size of a region of interest (ROI) is defined, use the area at the - * center of a given image. Otherwise, use the entire image. - * Zoom in/out the image if necessary to cover the ROI. Then, crop out an area - * of the desired size from the region either randomly within the ROI or at the - * center depending on the given specification. - */ -class cv_cropper : public cv_transform { - protected: - // --- configuration variables --- - unsigned int m_width; ///< desired width of an image - unsigned int m_height; ///< desired height of an image - /// randomize the center position of the area of interest - bool m_rand_crop; - /// indicate if a specific ROI is set or supposed to use whole image - bool m_is_roi_set; - /// The size of the initial region of interest to crop from - std::pair m_roi_size; - - // --- state variables --- - double m_zoom; ///< zoom factor to prepare the initial region for a given image - /** Three modes of pixel interpolation: INTER_LINEAR, INTER_AREA, and INTER_LINEAR - * The first choice is the default when not adaptive. The other two are used when - * interpolatng adaptively. 
The second is when shrinking, and the third is when enlarging - */ - static const int m_interpolation_choices[3]; - int m_interpolation; ///< id of the channel value interpolation method used - bool m_adaptive_interpolation; ///< whether to use adaptive interpolation - - void unset_roi(); - - public: - cv_cropper(); - cv_cropper(const cv_cropper& rhs) = default; - cv_cropper& operator=(const cv_cropper& rhs) = default; - cv_cropper *clone() const override; - ~cv_cropper() override {} - - /** - * Set the parameters all at once - * @param width desired width of the crop - * @param height desired height of the crop - * @param random_crop whether to crop randomly from the initial region of interest or at the center - * @param roi the size of the initial region of interest to crop from. Set (0,0) to use the full image. - * @param adaptive_interpolation whether to apply a different interpolation method depending on how an image is resized - */ - void set(const unsigned int width, const unsigned int height, - const bool random_crop = false, - const std::pair& roi = std::make_pair(0,0), - const bool adaptive_interpolation = false); - - unsigned int get_crop_width() const { return m_width; } - unsigned int get_crop_height() const { return m_height; } - - /// Clear the states of the previous transform applied - void reset() override; - - /** - * Construct transformation parameters based on the options and random - * numbers. If successful, the tranform is enabled.If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// Cropping is irreversible. Thus, this has no effect. - bool determine_inverse_transform() override { return false; } - - /** - * Apply the transformation determined. - * As this method is executed, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "cropper"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_CROPPER_HPP diff --git a/include/lbann/data_readers/cv_decolorizer.hpp b/include/lbann/data_readers/cv_decolorizer.hpp deleted file mode 100644 index 18e09aea0cf..00000000000 --- a/include/lbann/data_readers/cv_decolorizer.hpp +++ /dev/null @@ -1,84 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
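Both the cv_cropper removed here and the cv_resizer removed further below document the same adaptive-interpolation rule: a default mode when adaptive interpolation is off, an area-based filter when shrinking, and a different filter when enlarging. The sketch below only illustrates that rule; the actual constants the removed code stored in m_interpolation_choices are not shown in this patch, so the enlarging case is an assumption.

// Illustration of the documented rule, not the removed implementation.
// `scale` is the ratio of output size to input size.
#include <opencv2/imgproc.hpp>

int pick_interpolation(double scale, bool adaptive) {
  if (!adaptive) { return cv::INTER_LINEAR; }   // default mode
  return (scale < 1.0) ? cv::INTER_AREA         // shrinking
                       : cv::INTER_CUBIC;       // enlarging (assumed)
}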
-// -// cv_decolorizer .cpp .hpp - transform a color image into a single-channel -// monochrome image -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_DECOLORIZER_HPP -#define LBANN_CV_DECOLORIZER_HPP - -#include "lbann_config.hpp" -#include "cv_transform.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -class cv_decolorizer : public cv_transform { - protected: - // --- state variables --- - bool m_color; ///< whether an image is color or not - /// Method to used: either pick one channel, or mix BGR channels (default) - bool m_pick_1ch; - - public: - cv_decolorizer() : cv_transform(), m_color(false), m_pick_1ch(false) {} - cv_decolorizer(const cv_decolorizer& rhs); - cv_decolorizer& operator=(const cv_decolorizer& rhs); - cv_decolorizer *clone() const override; - - ~cv_decolorizer() override {} - - void set(const bool pick_1ch); - void reset() override { - m_enabled = false; - m_color = false; - } - - /** - * If a given image is in color, the tranform is enabled, and not otherwise. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// The decolorizing transform is irreversible. Thus, this has no effect. - bool determine_inverse_transform() override { return false; } - - /** - * Convert a color image to a monochrome image if enabled. - * As it is applied, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "decolorizer"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_DECOLORIZER_HPP diff --git a/include/lbann/data_readers/cv_mean_extractor.hpp b/include/lbann/data_readers/cv_mean_extractor.hpp deleted file mode 100644 index eef53a0afa5..00000000000 --- a/include/lbann/data_readers/cv_mean_extractor.hpp +++ /dev/null @@ -1,157 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_mean_extractor .cpp .hpp - accumulate mean over the image set -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_MEAN_EXTRACTOR_HPP -#define LBANN_CV_MEAN_EXTRACTOR_HPP - -#include "cv_transform.hpp" -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Computes a cumulative pixel-wise average of a stream of images. 
- * It is assumed that the images have the same size and the same number of - * channels. However, they are not required to have the same channel depth. - * If a channel value is an integral type, it is normalized to a floating - * point number of type Float_T between 0 and 1 (inclusive at both ends). - * If a channel value is already in a floating point type, the value is used - * without normalization. - * Images accumulate per pixel and a mean image is obtained by dividing each - * pixel accumulation by the total number of images (if m_batch_size is larger - * than the number of all the images observed). The current mean of images can - * be obtained at any point during the operation by the member function - * extract(). This returns the image normalized to the range of - * channel type, Channel_T. For example, if Channel_T is uint8_t, the range of - * mean values from 0.0 to 1.0 maps to the range from 0 to 256. - * To cope with a large number of images, one might rely on semi-moving average - * method. Up to m_batch_size number of images accumulate aa a batch while the - * moving average of batches is computed upon request by calling extract(). - * This is particularly useful when Float_T is single precision with a limited - * number of bits to represent a wide range of numbers and the images have a - * large bit depth. - */ -class cv_mean_extractor : public cv_transform { - public: - /// type of image statistics value accumulated - using Float_T = double; - static const unsigned int m_default_batch_size = 65536u; - - protected: - // --- configuration variables --- - unsigned int m_batch_size; ///< number of samples per batch - - // --- state variables --- - unsigned int m_batch_cnt; ///< number of complete batches - unsigned int m_partial_cnt; ///< number of samples currently contributing towards a batch - /// OpenCv type code used to create m_sum and m_avg based on Float_T and the number of channels - int m_type_code; - cv::Mat m_sum; ///< partial batch accumulated so far - cv::Mat m_avg; ///< cumulative moving average - - /// create the matrices for accumulating image statistics - void create_matrices(const unsigned int width, const unsigned int height, const unsigned int n_ch); - - public: - cv_mean_extractor(); - cv_mean_extractor(const cv_mean_extractor& rhs); - cv_mean_extractor& operator=(const cv_mean_extractor& rhs); - cv_mean_extractor *clone() const override; - - ~cv_mean_extractor() override {} - - void set(const unsigned int width, const unsigned int height, const unsigned int n_ch, - const unsigned int batch_sz = cv_mean_extractor::m_default_batch_size); - void set(const unsigned int batch_sz); - void reset() override; - - bool determine_transform(const cv::Mat& image) override; - /// The transform does not modify the image. Thus, this has no effect. - bool determine_inverse_transform() override; - bool apply(cv::Mat& image) override; - - template - cv::Mat extract() const; - - std::string get_type() const override { return "mean extractor"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -/** - * Convert the maxtrix representing the cumulative moving average of images - * observed so far into an image with the channel type 'Channel_T'. The default - * is uint8_t. If it is given as void, the matrix is returned as is. 
- */ -template -inline cv::Mat cv_mean_extractor::extract() const { - cv::Mat avg_so_far; - if (m_partial_cnt == 0u) { - avg_so_far = m_avg; - } else { - cv::addWeighted(m_avg, m_batch_cnt/static_cast(m_batch_cnt+1), - m_sum, 1/static_cast((m_batch_cnt + 1) * m_partial_cnt), - 0.0, avg_so_far, m_type_code); - } - - if (avg_so_far.empty()) return cv::Mat(); - - if (std::is_void::value) return avg_so_far; - - double minVal = 0.0; - double maxVal = 0.0; - cv::minMaxLoc(avg_so_far, &minVal, &maxVal, nullptr, nullptr); - //const double max_channel_type = std::numeric_limits::max(); - const double max_channel_type = depth_normalization::inverse_factor(); - - cv::Mat recovered; - if ((minVal < 0.0) || (maxVal > 1.0)) { - // This condition may rise either because of unnormalized images with raw - // floating point values or because of precision error. In these cases, - // the minimum value maps to 0 and the maximum value maps to the greatest - // value of Channel_T - const double range = maxVal-minVal; - if (range == 0.0) return cv::Mat(); - const double alpha = max_channel_type/range; - const double beta = - alpha*minVal; - avg_so_far.convertTo(recovered, cv_image_type::T(), - alpha, beta); - } else { - // In this case, 0 maps to 0, and 1 maps to the greatest value of Channel_T - avg_so_far.convertTo(recovered, cv_image_type::T(), - max_channel_type, 0.0); - } - - return recovered; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_MEAN_EXTRACTOR_HPP diff --git a/include/lbann/data_readers/cv_normalizer.hpp b/include/lbann/data_readers/cv_normalizer.hpp deleted file mode 100644 index dfaf2954f89..00000000000 --- a/include/lbann/data_readers/cv_normalizer.hpp +++ /dev/null @@ -1,399 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
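cv_mean_extractor::extract(), shown just above, folds the partially filled batch into the running batch average with one weighted sum. A scalar version of that arithmetic is sketched here, treating all accumulators as double (the declared Float_T).

// Scalar sketch of the weighting in cv_mean_extractor::extract():
// `avg` is the mean over `batch_cnt` complete batches and `sum` is the raw
// accumulation of `partial_cnt` samples that have not yet closed a batch.
double combine_running_mean(double avg, double sum,
                            unsigned batch_cnt, unsigned partial_cnt) {
  if (partial_cnt == 0u) { return avg; }  // nothing pending
  return avg * (batch_cnt / static_cast<double>(batch_cnt + 1)) +
         sum * (1.0 / static_cast<double>((batch_cnt + 1) * partial_cnt));
}

With batch_cnt == 0 the first term drops out and the result is simply the mean of the pending samples, which is why a mean image can be requested at any point during accumulation.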
-// -// lbann_cv_normalizer .cpp .hpp - Normalizing functions for images -// in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_NORMALIZER_HPP -#define LBANN_CV_NORMALIZER_HPP - -#include // typeid -#include "cv_transform.hpp" -#include "lbann/base.hpp" // DataType -#include "lbann/utils/mild_exception.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { -/** - * Modifies the channel values of each pixel according to the chosen normalization - * strategies: - * - Standardize to 0 mean - * - Standardize to unit variance - * - Scale to the range [0, 1] - * - Normalize via z-score - * - * Combine these strategies into a single per-pixel linear transform, and - * process them all at once. - * It tries to replace the values in place if possible, rather - * than creating a new copy of data, especially, if the channel data type of - * source image is the same as that of the resultant image. - */ -class cv_normalizer : public cv_transform { - public: - /** This is the interim type of input values computed from image data - * It does not have to be the same as the type of the values stored, i.e., DataType. - */ - using ComputeType = DataType; - //using ComputeType = double; - /** - * Define the type of normalization methods available. - * z-score method is essentially the combination of mean subtraction and unit variance - */ - enum normalization_type {_none=0, _u_scale=1, _mean_sub=2, _unit_var=4, _z_score=6}; - using channel_trans_t = std::pair; - - protected: - // --- configuration variables --- - /// Whether to normalize to 0 mean. - bool m_mean_subtraction; - /// Whether to normalize to unit variance. - bool m_unit_variance; - /// Whether to scale to [0, 1]. - bool m_unit_scale; - /// Whether to normalize via z-score. - bool m_z_score; - - - // --- state variables --- - /** - * The parameter to use for linearly transforming channel values of each pixel as: - * new_value[ch] = cv::saturate_cast(m_trans[ch].first*value[ch] + m_trans[ch].second) - */ - std::vector m_trans; - - - /// Set a normalization bit flag - normalization_type set_normalization_bits(const normalization_type ntype, const normalization_type flag) const { - return static_cast(static_cast(ntype) | static_cast(flag)); - } - - /// Mask normalization bits - normalization_type mask_normalization_bits(const normalization_type ntype, const normalization_type flag) const { - return static_cast(static_cast(ntype) & static_cast(flag)); - } - - /// Enable a particular normalization method - normalization_type& set_normalization_type(normalization_type& ntype, const normalization_type flag) const; - - /// Check if there is a reason to enable. (i.e., any option set) - bool check_to_enable() const override; - - public: - - cv_normalizer(); - cv_normalizer(const cv_normalizer& rhs); - cv_normalizer& operator=(const cv_normalizer& rhs); - cv_normalizer *clone() const override; - - ~cv_normalizer() override {} - - /// Set the parameters all at once - void set(const bool meansub, const bool unitvar, const bool unitscale, const bool zscore); - - /// Whether to subtract the per-channel and per-sample mean. - void subtract_mean(bool b) { - m_mean_subtraction = b; - } - /// Whether to normalize to unit variance, per-channel and per-sample. - void unit_variance(bool b) { - m_unit_variance = b; - } - /// Whether to scale to [0, 1] - void unit_scale(bool b) { - m_unit_scale = b; - } - /// Whether to normalize by z-scores, per-channel and per-sample. 
- void z_score(bool b) { - m_z_score = b; - } - - /// Set a pre-determined normalization transform. - void set_transform(const std::vector& t); - - /// Clear the states of the previous transform applied - void reset() override; - - /// Returns the channel-wise scaling parameter for normalization transform - std::vector transform() const { - return (m_enabled? m_trans : std::vector()); - } - - /** - * Combine the normalizations enabled and define a linear transform - * per pixel to address them all. If successful, the tranform is enabled. - * If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /** - * Reverse the normalization done as x' = alpha*x + beta by - * x = (x'- beta)/alpha - * If successful, the tranform is enabled. If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_inverse_transform() override; - - /** - * Apply the normalization defined as a linear tranform per pixel. - * As this method is executed, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - // utilities - template - static OutputIterator scale(InputIterator first, InputIterator last, OutputIterator result, - const std::vector trans); - - template - static bool scale_with_known_type(cv::Mat& image, const std::vector& trans); - - /** - * Scale an image using a set of parameters for linearly transforming channel - * values per pixel. - * The resultant image will contain channel values of LBANN's DataType. - */ - static bool scale(cv::Mat& image, const std::vector& trans); - - - template - static bool compute_mean_stddev_with_known_type(const cv::Mat& image, - std::vector& mean, std::vector& stddev, cv::InputArray mask); - - /// Compute the per-channel and per-sample mean and standard deviation - static bool compute_mean_stddev(const cv::Mat& image, - std::vector& mean, std::vector& stddev, - cv::InputArray mask=cv::noArray()); - - std::string get_type() const override { return "normalizer"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - - -/** - * Linearly transform each value while copying it from one sequential container - * to another, which may be the same container if the type of the initial value - * and that of the result are the same. 
- * The transformation is alpha[ch]*input[ch] + beta[ch] -> output[ch] - * @param first The beginning of the input interator - * @param last The last of the input iterator - * @param result The beginning of the output iterator - * @param trans Parameters for linearly transforming channel values per pixel - * @return the last of output iterator - */ -template -inline OutputIterator cv_normalizer::scale( - InputIterator first, InputIterator last, OutputIterator result, - const std::vector trans) { - const size_t NCh = trans.size(); - bool trivial_alpha = true; - bool trivial_beta = true; - - for (size_t ch=0u; ch < NCh; ++ch) { - trivial_alpha = trivial_alpha && (trans[ch].first == 1.0); - trivial_beta = trivial_beta && (trans[ch].second == 0.0); - } - - if (trivial_alpha && trivial_beta) { - if ((typeid(*first) == typeid(*result)) && - (reinterpret_cast(&(*first)) == - reinterpret_cast(&(*result)))) - // This way, it works both for iterator and for pointer - { - std::advance(result, std::distance(first,last)); - return result; - } else { - return std::copy(first, last, result); - } - } - - using T = typename std::iterator_traits::value_type; - - // At this point NCh should not be zero because both alpha and beta are not trivial. - if (NCh == 1) { - const ComputeType a = trans[0].first; - const ComputeType b = trans[0].second; - - while (first != last) { - *result = cv::saturate_cast(a * (*first) + b); - ++result; - ++first; - } - } else { - size_t ch = 0u; - - while (first != last) { - *result = cv::saturate_cast(trans[ch].first * (*first) + trans[ch].second); - ++result; - ++first; - ++ch; - ch = (ch % NCh); - } - } - return result; -} - - -/** - * Linear transform image pixels by scaling parameters given for each channel - * The transformation is trans[ch].first*input[ch] + trans[ch].second -> output[ch]. - * The first template parameter is the channel value type of the input image. - * The second one is the channel value type desired for the output image. - * - * @param image The image to be modified, which is the input and also the ouput. - * @param trans Parameters for linearly transforming channel values per pixel - * @return true if successful. The input image will be modified to a new one. - */ -template -inline bool cv_normalizer::scale_with_known_type(cv::Mat& image, - const std::vector& trans) { - const auto Width = static_cast(image.cols); - const auto Height = static_cast(image.rows); - const auto NCh = static_cast(image.channels()); - if ((trans.size() > 0u) && (trans.size() != NCh)) { - return false; - } - - - // overwrite the storage of the source image if the source and the result have - // the same data type. Otherwise, create a new image for the result. The result - // will replace the image referenced by the input. - if (std::is_same::value) { - if (image.isContinuous()) { - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - reinterpret_cast(image.data), trans); - } else { - // TODO: Should we make this to copy to a new continuous block instead of - // updating the values in-place? 
- const unsigned int stride = Width*NCh; - for (unsigned int i = 0u; i < Height; ++i) { - auto *optr = reinterpret_cast(image.ptr(i)); - const Tsrc *iptr = optr; - scale(iptr, iptr+stride, optr, trans); - } - } - } else { - cv::Mat image_out = cv::Mat(Height, Width, CV_MAKETYPE(cv::DataType::depth, NCh)); - - if (image.isContinuous()) { - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - reinterpret_cast(image_out.data), trans); - } else { - const unsigned int stride = Width*NCh; - auto *ptr_out = reinterpret_cast(image_out.data); - for (unsigned int i = 0u; i < Height; ++i, ptr_out += stride) { - const Tsrc *ptr = reinterpret_cast(image.ptr(i)); - scale(ptr, ptr+stride, ptr_out, trans); - } - } - image = image_out; - } - return true; -} - - -/** - * Compute the per-channel and per-sample mean and standard deviation - * for a sample image of channel value type T - */ -template -inline bool cv_normalizer::compute_mean_stddev_with_known_type(const cv::Mat& image, - std::vector& mean, std::vector& stddev, cv::InputArray mask) { - mean.clear(); - stddev.clear(); - if (image.empty()) { - return false; - } - - const int NCh = image.channels(); - const int num_pixels = image.rows * image.cols; - ComputeType sum[NCh]; - ComputeType sqsum[NCh]; - ComputeType shift[NCh]; - - for (int ch = 0; ch < NCh; ++ch) { - sum[ch] = 0.0; - sqsum[ch] = 0.0; - const auto *ptr = reinterpret_cast(image.datastart); - shift[ch] = static_cast(*(ptr+ch)); - } - - mean.resize(NCh); - stddev.resize(NCh); - - if (image.isContinuous()) { - const auto *ptr = reinterpret_cast(image.datastart); - const auto *const ptrend = reinterpret_cast(image.dataend); - - int ch = 0; - do { - const ComputeType diff = (*ptr - shift[ch]); - sum[ch] += diff; - sqsum[ch] += diff*diff; - ++ch; - ch = ch % NCh; - } while ((++ptr) != ptrend); - - for (int c = 0; c < NCh; ++c) { - const ComputeType shifted_mean = sum[c] / num_pixels; - mean[c] = shifted_mean + shift[c]; - stddev[c] = sqrt(std::max(sqsum[c]/num_pixels - shifted_mean * shifted_mean, ComputeType(0))); - } - } else { - const int stride = image.cols*NCh; - const int Height = image.rows; - - for (int i = 0; i < Height; ++i) { - const auto *ptr = reinterpret_cast(image.ptr(i)); - const T *const ptrend = ptr + stride; - - int ch = 0; - do { - const ComputeType diff = (*ptr - shift[ch]); - sum[ch] += diff; - sqsum[ch] += diff*diff; - ++ch; - ch = ch % NCh; - } while ((++ptr) != ptrend); - } - - for (int ch = 0; ch < NCh; ++ch) { - const ComputeType shifted_mean = sum[ch] / num_pixels; - mean[ch] = shifted_mean + shift[ch]; - stddev[ch] = sqrt(std::max(sqsum[ch]/num_pixels - shifted_mean*shifted_mean, ComputeType(0))); - } - } - return true; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_NORMALIZER_HPP diff --git a/include/lbann/data_readers/cv_process.hpp b/include/lbann/data_readers/cv_process.hpp deleted file mode 100644 index ffc315016a4..00000000000 --- a/include/lbann/data_readers/cv_process.hpp +++ /dev/null @@ -1,166 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. 
For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_process .cpp .hpp - structure that defines the operations -// on image data in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_PROCESS_HPP -#define LBANN_CV_PROCESS_HPP - -#include "cv_transform.hpp" -#include "cv_normalizer.hpp" -#include "cv_subtractor.hpp" -#include "cv_augmenter.hpp" -#include "cv_colorizer.hpp" -#include "cv_decolorizer.hpp" -#include "cv_cropper.hpp" -#include "cv_resizer.hpp" -#include "cv_mean_extractor.hpp" -#include -#include // std::numeric_limits - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** A structure packs the parameters for image pre-/post-processing that takes - * advantage of the OpenCV framework. - */ -class cv_process { - /// OpenCV flip codes: c<0 for top_left <-> bottom_right, c=0 for top<->down, and c>0 for left<->right - - protected: - /// unique name for the processor - std::string m_name; - /// Whether to flip an image - cv_transform::cv_flipping m_flip; - /// Whether to split channels - bool m_split; - /// whether a normalizing transform is set or not - bool m_is_normalizer_set; - /// The index of the normalizing transform in the array of transforms - unsigned int m_normalizer_idx; - - /// Array of transforms - std::vector > m_transforms; - - /// Check if the last transform registered in the list is a normalizer and not a subtractor - bool to_fuse_normalizer_with_copy() const; - - void set_normalizer_info(); - - public: - cv_process() - : m_flip(cv_transform::_no_flip_), m_split(true), m_is_normalizer_set(false), m_normalizer_idx(0u) {} - - cv_process(const cv_process& rhs); - cv_process& operator=(const cv_process& rhs); - - cv_process(const cv_transform::cv_flipping flip_code, const bool tosplit) - : m_flip(flip_code), m_split(tosplit), m_is_normalizer_set(false), m_normalizer_idx(0u) {} - - virtual ~cv_process() {} - - std::string get_name() const { return m_name; } - void set_name(const std::string& name) { m_name = name; } - - /// Reset all the transforms - void reset(); - - /// Check whether to flip - bool to_flip() const { - return (m_flip != cv_transform::_no_flip_); - } - /// Tell how to flip - int how_to_flip() const { - return static_cast(m_flip); - } - /** - * Set the flipping behavior. This is to deal with custom image format, which - * is not supported by OpenCV's builtin decoders and may impose different pixel - * coordinate system in its custom decoder. - * It is not to substitute for random flipping in augmentation. 
- */ - void set_to_flip(const cv_transform::cv_flipping f) { - m_flip = f; - } - /// Set to split channels - bool to_split() const { - return m_split; - } - - /// Export transform operator of normalizer to allow lazy application - std::vector get_transform_normalize() const; - /// Export transform operator of normalizer for a specific channel - std::vector get_transform_normalize(const unsigned int ch) const; - - /// Turn off normalizer. This is useful to make sure it off after potential lazy application - void disable_lazy_normalizer(); - - /// Turn off all transforms - void disable_transforms(); - - /// Add a tranform - bool add_transform(std::unique_ptr tr); - - /// Add a normalizing tranform - bool add_normalizer(std::unique_ptr tr); - bool add_normalizer(std::unique_ptr tr); - - /// Allow access to the list of transforms registered - const std::vector >& get_transforms() const { - return m_transforms; - } - - /// Allow read-only access to a particular transform indexed by idx - const cv_transform* get_transform(const unsigned int idx) const; - - /// Allow read-write access to a particular transform indexed by idx - cv_transform* get_transform(const unsigned int idx); - - /// Retrun the number of transforms registered - unsigned int get_num_transforms() const { return m_transforms.size(); } - - /** Return final image dimension {width, height} after all the transforms - * If a cropper is set, returns {crop_width, crop_height}. Otherwise, {0,0}. - */ - std::vector get_data_dims() const; - - void determine_inverse_lazy_normalization(); - - /// Execute a range of transforms [tr_strart, tr_end) on the given image in order - bool preprocess(cv::Mat& image, unsigned int tr_start = 0u, - unsigned int tr_end = std::numeric_limits::max()); - /// Execute all the inverse transforms on the given image in the reverse order - bool postprocess(cv::Mat& image); - - virtual std::string get_type() const { return "cv_process"; } - virtual std::string get_description() const; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_PROCESS_HPP diff --git a/include/lbann/data_readers/cv_process_patches.hpp b/include/lbann/data_readers/cv_process_patches.hpp deleted file mode 100644 index b9c52ff955a..00000000000 --- a/include/lbann/data_readers/cv_process_patches.hpp +++ /dev/null @@ -1,83 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
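For context on what these deletions retire: cv_process was the container that chained the per-image transforms above and applied them in registration order. A hedged sketch of that legacy usage follows; the element types of the unique_ptr arguments are assumed to be the transform classes being removed, and the crop size is an arbitrary example value.

// Sketch of the legacy OpenCV preprocessing path removed by this patch.
#include <memory>
#include <opencv2/core.hpp>

bool legacy_preprocess(cv::Mat& image) {
  lbann::cv_process proc;

  auto cropper = std::make_unique<lbann::cv_cropper>();
  cropper->set(224, 224, /*random_crop=*/true);   // example crop size
  proc.add_transform(std::move(cropper));

  auto normalizer = std::make_unique<lbann::cv_normalizer>();
  normalizer->set(/*meansub=*/true, /*unitvar=*/true,
                  /*unitscale=*/false, /*zscore=*/false);
  proc.add_normalizer(std::move(normalizer));

  return proc.preprocess(image);  // run every registered transform in order
}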
-// -// cv_process_patches .cpp .hpp - structure that defines the operations -// on patches extracted from an image in the opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_PROCESS_PATCHES_HPP -#define LBANN_CV_PROCESS_PATCHES_HPP - -#include "cv_process.hpp" -#include "patchworks/patchworks_patch_descriptor.hpp" -#include // std::numeric_limits - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/// Similar to cv_process but works on patches that are extracted from an image -class cv_process_patches : public cv_process { - protected: - patchworks::patch_descriptor m_pd; - bool m_self_label; - unsigned int m_when_to_extract; - - public: - cv_process_patches(); - cv_process_patches(const bool self_label); - cv_process_patches(const cv_process_patches& rhs); - cv_process_patches(const cv_transform::cv_flipping flip_code, const bool tosplit); - cv_process_patches& operator=(const cv_process_patches& rhs); - - ~cv_process_patches() override {} - - void set_patch_descriptor(const patchworks::patch_descriptor& pd, - const unsigned int when_to_extract = - std::numeric_limits::max()); - patchworks::patch_descriptor& patch_descriptor() { - return m_pd; - } - const patchworks::patch_descriptor& patch_descriptor() const { - return m_pd; - } - unsigned int get_when_to_extract() const { return m_when_to_extract; } - bool is_self_labeling() const { return m_self_label; } - unsigned int get_num_labels() const { return m_pd.get_num_labels(); } - virtual unsigned int get_patch_label() const { return m_pd.get_last_label(); } - unsigned int get_num_patches() const { return m_pd.get_num_patches(); } - std::vector get_data_dims() const { - return {m_pd.get_num_patches(), m_pd.get_patch_width(), m_pd.get_patch_height()}; - } - - bool preprocess(cv::Mat& image, std::vector& patches); - - std::string get_type() const override { return "cv_process_patches"; } - std::string get_description() const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_PROCESS_PATCHES_HPP diff --git a/include/lbann/data_readers/cv_resizer.hpp b/include/lbann/data_readers/cv_resizer.hpp deleted file mode 100644 index 69555897d2c..00000000000 --- a/include/lbann/data_readers/cv_resizer.hpp +++ /dev/null @@ -1,103 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// cv_resizer .cpp .hpp - Functions to resize images -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_RESIZER_HPP -#define LBANN_CV_RESIZER_HPP - -#include "lbann/data_readers/cv_transform.hpp" -#include -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Simple image resizing without maintaining the aspect ratio. - */ -class cv_resizer : public cv_transform { - protected: - // --- configuration variables --- - unsigned int m_width; ///< desired width of an image - unsigned int m_height; ///< desired height of an image - - // --- state variables --- - /** Three modes of pixel interpolation: INTER_LINEAR, INTER_AREA, and INTER_LINEAR - * The first choice is the default when not adaptive. The other two are used when - * interpolatng adaptively. The second is when shrinking, and the third is when enlarging - */ - static const int m_interpolation_choices[3]; - int m_interpolation; ///< id of the channel value interpolation method used - bool m_adaptive_interpolation; ///< whether to use adaptive interpolation - - public: - cv_resizer(); - cv_resizer(const cv_resizer& rhs) = default; - cv_resizer& operator=(const cv_resizer& rhs) = default; - cv_resizer *clone() const override; - ~cv_resizer() override {} - - /** - * Set the parameters all at once - * @param width desired width - * @param height desired height - * @param adaptive_interpolation whether to apply a different interpolation method depending on how an image is resized - */ - void set(const unsigned int width, const unsigned int height, - const bool adaptive_interpolation = false); - - unsigned int get_width() const { return m_width; } - unsigned int get_height() const { return m_height; } - - /// Clear the states of the previous transform applied - void reset() override; - - /** - * Determine whether to enable transformation. - * @return false if not enabled. - */ - bool determine_transform(const cv::Mat& image) override; - - /// Determine whether to enable inverse transformation. - bool determine_inverse_transform() override { return false; } - - /** - * Apply the transformation. - * As this method is executed, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "resizer"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_RESIZER_HPP diff --git a/include/lbann/data_readers/cv_subtractor.hpp b/include/lbann/data_readers/cv_subtractor.hpp deleted file mode 100644 index 169181c4576..00000000000 --- a/include/lbann/data_readers/cv_subtractor.hpp +++ /dev/null @@ -1,171 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_subtractor .cpp .hpp - subtract channel values of an image (possibly the -// pixel-wise mean of dataset) from the corresponding values of another (input) -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_SUBTRACTOR_HPP -#define LBANN_CV_SUBTRACTOR_HPP - -#include "cv_transform.hpp" -#include "lbann/base.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Subtract channel values of an image from the corresponding values of another. - * The former is likely to carry pre-computed mean data per pixel and per channel. - * The latter is an input image. Both image needs to have the same size and the - * same number of channels. The subtracted result is represented in the scale - * between 0 and 1 (both inclusive). - * In the common current use case, a colorizer comes before a subtractor which is - * followed by a random cropper. In this scenario, the input images must be resized - * in advance to match the size of the mean image. - * In another scenario, where the random cropping is not used but resizing is done - * on-line, the subtractor can come after cropper without requiring the input images - * to be resized in advance. - * Alternatively, even a simpler approach is to use a mean image with uniform pixels. - * In this way, it does not need to know the size of input images, and is not impacted - * by random cropping or flipping augmentation. - */ -class cv_subtractor : public cv_transform { - protected: - // --- configuration variables --- - /** - * The image to subtract from an input image in the pixel-wise fashion. - * It has channel values of a floating point type, in the scale from 0 to 1. - * An input image will be mapped into the scale before subtraction by linearly - * mapping the smallest representative value to 0 and the largest representative - * value to 1. - */ - cv::Mat m_img_to_sub; - - /** - * The image to divide an input image in the pixel-wise fashion. - * It has channel values of a floating point type, in the scale from 0 to 1. - * An input image will be mapped into the scale before division. - */ - cv::Mat m_img_to_div; - - /** uniform mean per channel used for channel-wise mean-subtraction. - * This is used to construct the m_img_to_sub when the size of the image is known. - */ - std::vector m_channel_mean; - - /** uniform standard deviation per channel used for channel-wise z-score (division). - * This is used to construct the m_img_to_div when the size of the image is known. - */ - std::vector m_channel_stddev; - - // --- state variables --- - bool m_applied; ///< has been subtracted - - public: - cv_subtractor() : cv_transform(), m_applied(false) {} - cv_subtractor(const cv_subtractor& rhs); - cv_subtractor& operator=(const cv_subtractor& rhs); - cv_subtractor *clone() const override; - - ~cv_subtractor() override {} - - static cv::Mat read_binary_image_file(const std::string filename); - - /// Load and set the image to subtract from every input image. 
- void set_mean(const std::string name_of_img, const int depth_code = cv_image_type::T()); - - /** - * Set the mean fixed per channel for mean-subtracting each input image. - * This supports an alternative method for mean subtraction given that the - * mean per channel is uniform. - */ - void set_mean(const std::vector channel_mean); - - /** - * Set the dataset-wise mean image to subtract from each input image. - * The image represents the pre-computed pixel-wise mean of the dataset. - * In case that this image is not in a floating point type, it is converted to - * one with the depth specified by depth_code. - */ - void set_mean(const cv::Mat& img, const int depth_code = cv_image_type::T()); - - /// Load and set the image to normalize the pixels of every input image. - void set_stddev(const std::string name_of_img, const int depth_code = cv_image_type::T()); - - /** - * Set the dataset-wise standard deviation fixed per channel for normalizing - * each input image. - * This supports an alternative method for normalizing with stddev given that - * it is uniform per channel. - */ - void set_stddev(const std::vector channel_stddev); - - /** - * Set the dataset-wise standard deviation to normalize each input image. - * In case that this image is not in a floating point type, it is converted to - * one with the depth specified by depth_code. - */ - void set_stddev(const cv::Mat& img, const int depth_code = cv_image_type::T()); - - void reset() override { - m_enabled = false; - m_applied = false; - } - - /** - * If a given image is in grayscale, the tranform is enabled, and not otherwise. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// convert back to color image if it used to be a grayscale image - bool determine_inverse_transform() override; - - /** - * Apply color conversion if enabled. - * As it is applied, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - /// true if both sub and div are channel-wise - bool check_if_channel_wise() const; - - std::string get_type() const override { return "subtractor"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; - - protected: - /// Construct an image of the unform channel values using the channel-wise mean. - bool create_img_to_sub(int width, int height, int n_channels); - /// Construct an image of the unform channel values using the channel-wise stddev. - bool create_img_to_div(int width, int height, int n_channels); -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_SUBTRACTOR_HPP diff --git a/include/lbann/data_readers/cv_transform.hpp b/include/lbann/data_readers/cv_transform.hpp deleted file mode 100644 index 72455fc8907..00000000000 --- a/include/lbann/data_readers/cv_transform.hpp +++ /dev/null @@ -1,221 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. 
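For reference, a minimal OpenCV sketch (not the removed LBANN implementation) of the normalization the deleted cv_subtractor above describes: map an 8-bit image into [0,1], subtract a per-channel mean, then divide by a per-channel standard deviation, expressed as one linear map per channel. The helper name, the CV_8UC3 input assumption, and the use of plain std::vector parameters are illustrative.

#include <opencv2/core.hpp>
#include <vector>

// Illustrative helper; assumes a 3-channel 8-bit input image (CV_8UC3).
inline cv::Mat subtract_and_scale(const cv::Mat& input,
                                  const std::vector<double>& channel_mean,
                                  const std::vector<double>& channel_stddev) {
  std::vector<cv::Mat> channels;
  cv::split(input, channels);                // one 8-bit plane per channel
  for (size_t c = 0; c < channels.size(); ++c) {
    // x -> (x/255 - mean[c]) / stddev[c], folded into a single a*x + b map
    const double a = 1.0 / (255.0 * channel_stddev[c]);
    const double b = -channel_mean[c] / channel_stddev[c];
    channels[c].convertTo(channels[c], CV_32F, a, b);
  }
  cv::Mat out;
  cv::merge(channels, out);                  // back to an interleaved image
  return out;
}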
-// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_transform .cpp .hpp - base class for the transformation -// on image data in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_TRANSFORM_HPP -#define LBANN_CV_TRANSFORM_HPP - -#include "opencv.hpp" -#include "opencv_extensions.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -class cv_transform { - protected: - // --- configuration variables --- - // place for the variables to keep the configuration set during initialization - - std::string m_name; - - // --- state variables --- - /// per-image indicator of whether to apply transform or not - bool m_enabled; - - // transform prepared given the configuration (and the image) - // m_trans; - - // Allow to manually shut transform off without destroying it - //bool m_manual_switch; - - /** Check if transform is configured to apply. - * (e.g., if any of the augmentaion methods is enabled) - */ - virtual bool check_to_enable() const { - return true; - } - - public: - enum cv_flipping {_both_axes_=-1, _vertical_=0, _horizontal_=1, _no_flip_=2}; - static const constexpr char* const cv_flip_desc[] = {"both_axes", "vertical", "horizontal", "none"}; - static std::string flip_desc(const cv_flipping flip_code) { return std::string(cv_flip_desc[static_cast(flip_code)+1]); } - - static const float pi; - - - cv_transform(); - cv_transform(const cv_transform& rhs); - cv_transform& operator=(const cv_transform& rhs); - virtual cv_transform *clone() const; - - virtual ~cv_transform() {} - - // define a method to configure the transform - // void set(args) { reset(); ... } - /// Reset the transform state but do not alter the configuration variables - virtual void reset() { - m_enabled = false; - // e.g., m_trans.clear(); - } - - virtual bool determine_transform(const cv::Mat& image); - virtual bool determine_inverse_transform(); - virtual bool apply(cv::Mat& image) = 0; - - /// Turn transform on - void enable() { - m_enabled = true; - } - /// Turn transform off - void disable() { - m_enabled = false; - } - /// Check if transform is on - bool is_enabled() const { - return m_enabled; - } - - //bool toggle_manual_switch() { return (m_manual_switch = !m_manual_switch); } - - // administrative methods - /** Return this transform's type, e.g: "augmenter," "normalizer," etc. */ - virtual std::string get_type() const = 0; - - /// Returns this transform's name - std::string get_name() const { return m_name; } - - /** Sets this transform's name; this is an arbitrary string, e.g, assigned in a prototext file. 
*/ - void set_name(const std::string& name) { m_name = name; } - - /** Returns a description of the parameters passed to the ctor */ - virtual std::string get_description() const; - - virtual std::ostream& print(std::ostream& os) const; -}; - -/// Default constructor -inline cv_transform::cv_transform() - : m_name(""), m_enabled(false)//, m_manual_switch(false) -{} - -/// Deep-copying constructor -inline cv_transform::cv_transform(const cv_transform& rhs) - : m_name(rhs.m_name), m_enabled(rhs.m_enabled) {} - -/// Assignement operator. deep-copy everything -inline cv_transform& cv_transform::operator=(const cv_transform& rhs) { - m_enabled = rhs.m_enabled; - m_name = rhs.m_name; - return *this; -} - -/** Prepare transform for the given image as configured. - * Then, check if they are valid, and turn the transform on if so. - * The preparation includes as much precomputation as possible. For example, - * if the transformation consists of constructing four affine transform matrices - * and applying them to the given image in sequence, the transform matrices - * will be reduced to one. Then, the following function apply(image) will - * finally apply it to the image. - */ -inline bool cv_transform::determine_transform(const cv::Mat& image) { - // clear any transform state computed for previous image - // reset() - m_enabled = check_to_enable(); - // if (!m_enabled) return false; - // compute m_trans for the image and the configuration of the transform - // Here, some transform may not applicable to the given image. - // In that case, set m_enabled = false (or fruther throw an exception). - return m_enabled; -} - -/** Prepare the inverse transform to undo preprocessing transforms if needed - * for postprocessing. Not all transforms can be or need to be inversed. - * Then, check if they are valid, and turn the transform on if so. - * By default, turn this off as we do not need to undo in most of the cases. - * In need of manual overriding to enable/disable inverse transform, implement - * such a logic in this fuction and interfaces to enable/disable. - */ -inline bool cv_transform::determine_inverse_transform() { - // In case of manual overriding, if (!m_manual_switch) return false; - // If this transform, by design, can not be or does not need to be inversed, - // return (m_enabled = false); - // - // If the transform has not been applied (e.g., m_trans has not been set), - // return (m_enabled = false); - // Note that this cannot be determined by m_enabled as the transform is turned - // off once applied. - // - // Compute the inverse of m_trans and overwrite m_trans; - // set m_enabled to true; - // return true; - return false; -} - -/** Apply transform once and turn it off - * To conditionally apply the transform given an image, - * determine_transform(image) or determine_inverse_transform() must be called - * in advance. These will do as much precomputation as possible. For example, - * if the transformation consists of constructing four affine transform matrices - * and multiplying them to the given image in sequence, the transform matrices - * will be reduced to one. Then, this function will finally apply it to the image. - * There are three possible ways to implement condition checking as shown below, - * but here the third option is preferred for minimizing the number of calls - * 1. checking m_enabled internally - * 2. externally call is_enabled() - * 3. 
rely on the return value of determine_transform()/determine_inverse_transform() - */ -inline bool cv_transform::apply(cv::Mat& image) { - // As the transform is applied once, turn this off - m_enabled = false; - // Return the success of transform - return true; -} - -/// Return the pointer of a newly copy-constructed object -inline cv_transform *cv_transform::clone() const { - return static_cast(nullptr); -} - -//inline std::string cv_transform::get_type() { return "image transform"; } - -inline std::string cv_transform::get_description() const { - return std::string {} + get_type(); -} - -inline std::ostream& cv_transform::print(std::ostream& os) const { - os << get_description(); // Print out configuration variables - // Additionally, print out state variables as well - return os; -} - -std::ostream& operator<<(std::ostream& os, const cv_transform& tr); - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_TRANSFORM_HPP diff --git a/include/lbann/data_readers/cv_utils.hpp b/include/lbann/data_readers/cv_utils.hpp deleted file mode 100644 index fdac1bc77e3..00000000000 --- a/include/lbann/data_readers/cv_utils.hpp +++ /dev/null @@ -1,498 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_utils .cpp .hpp - operations related to opencv images -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_UTILS_HPP -#define LBANN_CV_UTILS_HPP - -#include -#include // operator typeid -#include "opencv_extensions.hpp" -#include "cv_process.hpp" -#include "lbann/utils/mild_exception.hpp" - - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -class cv_utils { - public: - - // copy_cvMat_to_buf (with a tempoary buffer) - template - static bool copy_cvMat_to_buf_with_full_info(const cv::Mat& image, std::vector& buf, const cv_process& pp); - - template - static bool copy_cvMat_to_buf_with_known_type(const cv::Mat& image, std::vector& buf, const cv_process& pp); - - /** Copy a cv::Mat image into a serialized buffer. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. 
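The comments above describe the intended call pattern for the removed cv_transform interface: precompute as much as possible in determine_transform(), then apply once, relying on the return value (option 3). A minimal usage sketch using the cv_resizer declared earlier; the dimensions are illustrative.

// Resize a single image with the removed transform API (sketch only,
// against the former lbann/data_readers/cv_resizer.hpp interface).
void resize_one(cv::Mat& img) {
  lbann::cv_resizer resizer;
  resizer.set(256, 256);                  // desired width and height
  if (resizer.determine_transform(img)) { // precompute; returns whether enabled
    resizer.apply(img);                   // applies once, then deactivates itself
  }
}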
- */ - static bool copy_cvMat_to_buf(const cv::Mat& image, std::vector& buf, const cv_process& pp); - - - // copy_buf_to_cvMat (with a tempoary buffer) - template - static cv::Mat copy_buf_to_cvMat_with_full_info(const std::vector& buf, const int Width, const int Height, const cv_process& pp); - - template - static cv::Mat copy_buf_to_cvMat_with_known_type(const std::vector& buf, const int Width, const int Height, const cv_process& pp); - - /** Reconstruct a cv::Mat image from a serialized buffer. - * The image size is specified by Width and Height. Type indetifies the - * OpenCV image type. The last argument pp specifies the parameters for - * image postprocessing that takes advantage of the OpenCV framework. - * Returns a reconstructed cv::Mat image if successful and an empty one - * otherwise. - */ - static cv::Mat copy_buf_to_cvMat(const std::vector& buf, const int Width, const int Height, const int Type, const cv_process& pp); - - - // copy_buf_to_cvMat (with an El::Matrix block) - template - static bool copy_cvMat_to_buf_with_full_info(const cv::Mat& image, CPUMat& buf, const cv_process& pp); - - template - static bool copy_cvMat_to_buf_with_known_type(const cv::Mat& image, CPUMat& buf, const cv_process& pp); - - /** Copy a cv::Mat image into an El::Matrix block. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. - */ - static bool copy_cvMat_to_buf(const cv::Mat& image, CPUMat& buf, const cv_process& pp); - - - // copy_buf_to_cvMat (with an El::Matrix block) - template - static cv::Mat copy_buf_to_cvMat_with_full_info(const CPUMat& buf, const int Width, const int Height, const cv_process& pp); - - template - static cv::Mat copy_buf_to_cvMat_with_known_type(const CPUMat& buf, const int Width, const int Height, const cv_process& pp); - - /** Reconstruct a cv::Mat image from an El::Matrix block. - * The image size is specified by Width and Height. Type indetifies the - * OpenCV image type. The last argument pp specifies the parameters for - * image postprocessing that takes advantage of the OpenCV framework. - * Returns a reconstructed cv::Mat image if successful and an empty one - * otherwise. - */ - static cv::Mat copy_buf_to_cvMat(const CPUMat& buf, const int Width, const int Height, const int Type, const cv_process& pp); - - /** - * Use cv::imdecode() to load an image data instead of relying on cv::imread(). - * This avoids reading the image header to determine the decoder directly from - * the file but allow doing so from the memory. - * The arguments are the same as the ones with cv::imread() as well as the - * return type. Avoiding the extra access to the underlying filesystem may - * result in a better performance. - */ - static cv::Mat lbann_imread(const std::string& img_file_path, int flags, std::vector& buf, cv::Mat* image = nullptr); -}; - - -//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// copy_cvMat_to_buf (vector) -/** - * Copy a cv::Mat image into a serialized buffer. This requires the type of - * channel values and the number of channels in the image to be known at - * compile time. The default for these are the type uint8_t and 3 channels. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. 
- */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_full_info( - const cv::Mat& image, std::vector& buf, const cv_process& pp) { - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - const int Width = image.cols; - const int Height = image.rows; - const int sz = Height*Width; - - buf.resize(sz*NCh*sizeof(T)); - auto *Pixels = reinterpret_cast(&(buf[0])); - - if (pp.to_split()) { - // TODO: like the case with the output in El::Matrixi type, branch on whether the - // input channel type T is same as that of the output (especially ::DataType) - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", false); - std::vector channels(NCh); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), Pixels); - } - cv::split(image, channels); - - Pixels = reinterpret_cast(&(buf[0])); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - cv_normalizer:: - scale(Pixels, Pixels + sz, Pixels, {trans[ch]}); - } - } else { - if (image.isContinuous()) { - cv_normalizer:: - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - Pixels, pp.get_transform_normalize()); - } else { - const int stride = Width*NCh; - for (int i = 0; i < Height; ++i, Pixels += stride) { - const auto *ptr = reinterpret_cast(image.ptr(i)); - cv_normalizer:: - scale(ptr, ptr+stride, Pixels, pp.get_transform_normalize()); - } - } - } - - return true; -} - -/** - * Copy a cv::Mat image into a serialized buffer. This requires the type of - * channel values to be known at compile time. The default type is uint8_t. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. - */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_known_type( - const cv::Mat& image, std::vector& buf, const cv_process& pp) { - _SWITCH_CV_FUNC_KNOWN_TYPE_3PARAMS(image.channels(), T, \ - copy_cvMat_to_buf_with_full_info, \ - image, buf, pp) - return false; -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - - -//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// copy_buf_to_cvMat (vector) -/** - * Reconstruct a cv::Mat image from a serialized buffer. This requires the type - * of channel values and the number of channels in the image to be known at - * compile time. The default for these are the type uint8_t and 3 channels. - * The image size is specified by Width and Height. The argument pp specifies - * the parameters for image postprocessing that takes advantage of the OpenCV - * framework. Returns an empty image if unsuccessful. - */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_full_info( - const std::vector& buf, const int Width, const int Height, const cv_process& pp) { - - const int sz = Height*Width; - - _LBANN_MILD_EXCEPTION(sz*NCh*sizeof(T) != buf.size(), \ - "Size mismatch. 
Buffer has " << buf.size() << " items when " \ - << sz*NCh*sizeof(T) << " are expected.", \ - cv::Mat()) - - const auto *Pixels = reinterpret_cast(&(buf[0])); - - cv::Mat image = cv::Mat(Height, Width, CV_MAKETYPE(cv::DataType::depth, NCh)); - - if (pp.to_split()) { - // TODO: like the case with the output of El::Matrix type, branch on whether the - // input channel type T is same as that of the output (especially ::DataType) - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", cv::Mat()); - std::vector channels(NCh); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), const_cast(Pixels)); - } - - cv::merge(channels, image); - auto *optr = reinterpret_cast(image.data); - for(size_t ch=0; ch < NCh; ++ch, optr += sz) { - cv_normalizer:: - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - optr, {trans[ch]}); - } - } else { - cv_normalizer:: - scale(Pixels, Pixels + sz*NCh, reinterpret_cast(image.data), pp.get_transform_normalize()); - } - - return image; -} - -/** - * Reconstruct a cv::Mat image from a serialized buffer. This requires the type - * of channel values to be known at compile time. The default type is uint8_t. - * The image size is specified by Width and Height. The last argument pp - * specifies the parameters for image postprocessing that takes advantage of the - * OpenCV framework. Returns a reconstructed cv::Mat image if successful and an - * empty one otherwise. - */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_known_type( - const std::vector& buf, const int Width, const int Height, const cv_process& pp) { - _LBANN_MILD_EXCEPTION(buf.size() == 0u || Width == 0 || Height == 0, \ - "An empty image (" << Height << " x " << Width << ") or a buffer (" << buf.size() << ")", \ - cv::Mat()) - - const auto sz = static_cast(Width*Height*sizeof(T)); - const size_t NCh = buf.size()/sz; - - _LBANN_MILD_EXCEPTION(sz*NCh != buf.size(), \ - "Size mismatch. Buffer has " << buf.size() << " items when " << sz*NCh << " are expected.", \ - cv::Mat()) - - _SWITCH_CV_FUNC_KNOWN_TYPE_4PARAMS(NCh, T, \ - copy_buf_to_cvMat_with_full_info, \ - buf, Width, Height, pp); - - _LBANN_DEBUG_MSG(NCh << "-channel image is not supported."); - return cv::Mat(); -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - - - -//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// copy_cvMat_to_buf (Elemental) -/** - * Copy a cv::Mat image into a data block of El::Matrix type. This - * requires the type of channel values and the number of channels in the image - * to be known at compile time. The default for these are the DataType of LBANN - * and 3 channels. In case of copying a single image into a collection of - * images as an existing El::Matrix matrix, a sub-matrix View can be passed. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. - */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_full_info( - const cv::Mat& image, CPUMat& buf, const cv_process& pp) { - // NCh need not be a template parameter here. It can be a function argument. 
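The key trick in the copy routines above is to build per-channel cv::Mat headers that alias a contiguous destination buffer, so that cv::split writes the interleaved (HWC) image directly into planar, channel-major (CHW) order. A stripped-down sketch of that technique, assuming a CV_32FC3 input; the helper name is illustrative.

#include <opencv2/core.hpp>
#include <vector>

// Copy an interleaved CV_32FC3 image into a planar (CHW) float buffer.
inline void to_planar(const cv::Mat& img, std::vector<float>& buf) {
  const int sz = img.rows * img.cols;
  buf.resize(3 * sz);
  std::vector<cv::Mat> planes;
  for (int ch = 0; ch < 3; ++ch) {
    // Each plane aliases one channel-sized slice of buf; no extra copy is made.
    planes.emplace_back(img.rows, img.cols, CV_32FC1, buf.data() + ch * sz);
  }
  cv::split(img, planes); // writes each channel directly into its slice of buf
}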
- // However, keeping it as a static parameter enables custom accesses on pixels - // For example, - // using Vec_T = cv::Vec; - // image.at(y, x) = newPixel; - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - const int Width = image.cols; - const int Height = image.rows; - const int sz = Height*Width; - - if (buf.Height() != sz*NCh) { -#if 0 - return false; -#else - //_LBANN_DEBUG_MSG("Resizing buffer height to " << sz*NCh); - buf.Resize(sz*NCh, ((buf.Width()<1)? 1 : buf.Width())); -#endif - } - - DataType *Pixels = buf.Buffer(); - - if (pp.to_split()) { - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", false); - std::vector channels(NCh); - - if (std::is_same::value) { - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - // create a separate image per channel aliasing the memory of buf - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), Pixels); - } - Pixels = buf.Buffer(); - - cv::split(image, channels); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - cv_normalizer:: - scale(Pixels, Pixels + sz, Pixels, {trans[ch]}); - } - } else { - cv::split(image, channels); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - cv_normalizer:: - scale(reinterpret_cast(channels[ch].datastart), - reinterpret_cast(channels[ch].dataend), - Pixels, {trans[ch]}); - } - } - } else { - if (image.isContinuous()) { - cv_normalizer:: - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - Pixels, pp.get_transform_normalize()); - } else { - const int stride = Width*NCh; - for (int i = 0; i < Height; ++i, Pixels += stride) { - const auto *ptr = reinterpret_cast(image.ptr(i)); - cv_normalizer:: - scale(ptr, ptr+stride, Pixels, pp.get_transform_normalize()); - } - } - } - - return true; -} - -/** - * Copy a cv::Mat image into a data block of El::Matrix type. This - * requires the type of channel values in the image to be known at compile time. - * The default for these are the DataType of LBANN. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. - */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_known_type( - const cv::Mat& image, CPUMat& buf, const cv_process& pp) { - _SWITCH_CV_FUNC_KNOWN_TYPE_3PARAMS(image.channels(), T, \ - copy_cvMat_to_buf_with_full_info, \ - image, buf, pp) - return false; -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - - -//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// copy_buf_to_cvMat (Elemental) -/** - * Reconstruct a cv::Mat image from a data block of El::Matrix type. - * This requires the type of channel values and the number of channels in the - * image to be known at compile time. The default for these are DataType of - * LBANN and 3 channels. In case of copying a single image data in a matrix - * of multiple images, a sub-matrix View can be passed. - * The image size is specified by Width and Height. The argument pp specifies - * the parameters for image postprocessing that takes advantage of the OpenCV - * framework. Returns an empty image if unsuccessful. 
- */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_full_info( - const CPUMat& buf, const int Width, const int Height, const cv_process& pp) { - - const int sz = Height*Width; - _LBANN_MILD_EXCEPTION(sz*NCh != buf.Height(), \ - "Size mismatch. Buffer has " << buf.Height() << " items in a column when " \ - << sz*NCh << " are expected.", \ - cv::Mat()) - - const DataType *Pixels = buf.LockedBuffer(); - - cv::Mat image = cv::Mat(Height, Width, CV_MAKETYPE(cv::DataType::depth, NCh)); - - if (pp.to_split()) { - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", cv::Mat()); - std::vector channels(NCh); - - if (std::is_same::value) { - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), - const_cast(Pixels)); - } - - cv::merge(channels, image); - const auto *iptr = reinterpret_cast(image.data); - auto *optr = reinterpret_cast(image.data); - - cv_normalizer:: - scale(iptr, iptr+sz*NCh, optr, trans); - } else { - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1)); - cv_normalizer:: - scale(Pixels, Pixels+sz, - reinterpret_cast(channels[ch].data), {trans[ch]}); - } - cv::merge(channels, image); - } - } else { - cv_normalizer:: - scale(Pixels, Pixels + sz*NCh, - reinterpret_cast(image.data), - pp.get_transform_normalize()); - } - - return image; -} - -/** - * Reconstruct a cv::Mat image from a data block of El::Matrix type. - * This requires the type of channel values to be known at compile time. The - * default type is DataType. In this case, the new image may require conversion - * to an integer type during postprocessing such that it can be stored in an - * typical image file format. An image can sometimes be constructed even when - * T is different from DataType if the type casting of a DataType value into T - * is valid. - * The image size is specified by Width and Height. The last argument pp - * specifies the parameters for image postprocessing that takes advantage of the - * OpenCV framework. This returns a reconstructed cv::Mat image if successful - * and an empty one otherwise. - */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_known_type( - const CPUMat& buf, const int Width, const int Height, const cv_process& pp) { - _LBANN_MILD_EXCEPTION(buf.Height() == 0u || buf.Width() == 0u || Width == 0 || Height == 0, \ - "An empty image (" << Height << " x " << Width << ") or a buffer (" \ - << buf.Height() << " x " << buf.Width() << ").", \ - cv::Mat()) - - const int sz = Height*Width; - const int NCh = buf.Height()/sz; - - _LBANN_MILD_EXCEPTION(sz*NCh != buf.Height(), \ - "Size mismatch. 
Buffer has " << buf.Height() << " items in a column when " \ - << sz*NCh << " are expected.", \ - cv::Mat()) - - _SWITCH_CV_FUNC_KNOWN_TYPE_4PARAMS(NCh, T, \ - copy_buf_to_cvMat_with_full_info, \ - buf, Width, Height, pp) - - _LBANN_DEBUG_MSG(NCh << "-channel image is not supported."); - return cv::Mat(); -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_UTILS_HPP diff --git a/include/lbann/data_readers/data_reader.hpp b/include/lbann/data_readers/data_reader.hpp index b70c576c376..c2a4c8db152 100644 --- a/include/lbann/data_readers/data_reader.hpp +++ b/include/lbann/data_readers/data_reader.hpp @@ -30,21 +30,23 @@ #define LBANN_DATA_READER_HPP #include "lbann/base.hpp" -#include "lbann/utils/random.hpp" +#include "lbann/data_coordinator/data_coordinator_metadata.hpp" +#include "lbann/utils/random_number_generators.hpp" #include "lbann/utils/exception.hpp" #include "lbann/comm.hpp" #include "lbann/io/file_io.hpp" #include "lbann/io/persist.hpp" -#include "lbann/data_readers/image_preprocessor.hpp" #include "lbann/utils/options.hpp" -#include "lbann/utils/threads/thread_pool.hpp" +#include "lbann/transforms/transform_pipeline.hpp" #include #include #include #include #include #include - +#include +#include +#include #define NOT_IMPLEMENTED(n) { \ std::stringstream s; \ @@ -54,7 +56,8 @@ namespace lbann { class data_store_conduit; -class model; +class thread_pool; +class trainer; /** * A data reader manages reading in data in a particular format. @@ -62,7 +65,7 @@ class model; * classes should implement load and the appropriate subset of fetch_datum, * fetch_label, and fetch_response. */ -class generic_data_reader : public lbann_image_preprocessor { +class generic_data_reader { public: #define JAG_NOOP_VOID if (m_jag_partitioned) { return; } @@ -72,6 +75,7 @@ class generic_data_reader : public lbann_image_preprocessor { * ctor */ generic_data_reader(bool shuffle = true) : + m_verbose(options::get()->get_bool("verbose")), m_data_store(nullptr), m_comm(nullptr), m_mini_batch_size(0), m_current_pos(0), @@ -99,14 +103,23 @@ class generic_data_reader : public lbann_image_preprocessor { m_procs_per_partition(1), m_io_thread_pool(nullptr), m_jag_partitioned(false), - m_model(nullptr) - {} + m_trainer(nullptr), + m_issue_warning(true) + { + } generic_data_reader(const generic_data_reader&) = default; generic_data_reader& operator=(const generic_data_reader&) = default; - ~generic_data_reader() override {} + virtual ~generic_data_reader() {} virtual generic_data_reader* copy() const = 0; + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_current_mini_batch_idx), + CEREAL_NVP(m_current_pos), + CEREAL_NVP(m_shuffled_indices)); + } + /// set the comm object void set_comm(lbann_comm *comm) { m_comm = comm; @@ -249,16 +262,7 @@ class generic_data_reader : public lbann_image_preprocessor { * Set an idenifier for the dataset. * The role should be one of "train", "test", or "validate". */ - virtual void set_role(std::string role) { - m_role = role; - if (options::get()->has_string("jag_partitioned") - && get_role() == "train") { - m_jag_partitioned = true; - if (is_master()) { - std::cerr << "USING JAG DATA PARTITIONING\n"; - } - } - } + virtual void set_role(std::string role); /** * Get the role for this dataset. 
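The serialize() member added to generic_data_reader above follows the standard cereal pattern. A hedged sketch of how a checkpoint might drive it; the archive type, stream handling, and function name are illustrative, not LBANN's actual checkpoint path.

#include <fstream>
#include <string>
#include <cereal/archives/binary.hpp>
#include <cereal/types/vector.hpp> // for the std::vector member being archived
#include "lbann/data_readers/data_reader.hpp"

// Write the reader's minimal restart state (current position, minibatch
// index, shuffled indices) to a binary archive.
void checkpoint_reader(lbann::generic_data_reader& reader,
                       const std::string& path) {
  std::ofstream os(path, std::ios::binary);
  cereal::BinaryOutputArchive ar(os);
  ar(reader); // invokes reader.serialize(ar) on the CEREAL_NVP'd members
}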
@@ -281,7 +285,7 @@ class generic_data_reader : public lbann_image_preprocessor { * If the base offset is not specified set it to 0 * If the stride is not specified set it to batch size */ - virtual void setup(int num_io_threads, std::shared_ptr io_thread_pool); + virtual void setup(int num_io_threads, observer_ptr io_thread_pool); /** Return this data_reader's type */ virtual std::string get_type() const = 0; @@ -293,15 +297,6 @@ class generic_data_reader : public lbann_image_preprocessor { /// Fetch this mini-batch's responses into Y. virtual int fetch_responses(CPUMat& Y); - /** - * Save pixels to an image. The implementing data reader is responsible for - * handling format detection, conversion, etc. - */ - // TODO: This function needs to go away from here - void save_image(Mat& pixels, const std::string filename, - bool do_scale = true) override { - NOT_IMPLEMENTED("save_image"); - } /** * During the network's update phase, the data reader will * advanced the current position pointer. If the pointer wraps @@ -353,6 +348,13 @@ class generic_data_reader : public lbann_image_preprocessor { virtual const std::vector get_data_dims() const { return std::vector(0); } + + virtual std::vector get_slice_points(const slice_points_mode var_category, + bool& is_supported) { + is_supported = false; + return {}; + } + /// True if the data reader's current position is valid. virtual bool position_valid() const { return (m_current_pos < get_num_data()); @@ -567,9 +569,17 @@ class generic_data_reader : public lbann_image_preprocessor { } /** - * Select the appropriate subset of data based on settings. + * Optionally resizes the shuffled indices based on the data reader + * prototext settings: absolute_sample_count, percent_of_data_to_use. + * (dah - this was formerly part of select_subset_of_data) */ - virtual void select_subset_of_data(); + void resize_shuffled_indices(); + + /** + * Select the appropriate subset of data for the validation set based on + * the data reader prototext setting: validation_percent + */ + void select_subset_of_data(); /// called by select_subset_of_data() if data set is partitioned void select_subset_of_data_partitioned(); @@ -593,96 +603,15 @@ class generic_data_reader : public lbann_image_preprocessor { /** \brief Given directory to store checkpoint files, write state to file and add to number of bytes written */ - bool save_to_checkpoint_shared(persist& p, const char *name); + bool save_to_checkpoint_shared(persist& p, execution_mode mode); /** \brief Given directory to store checkpoint files, read state from file and add to number of bytes read */ - bool load_from_checkpoint_shared(persist& p, const char *name); + bool load_from_checkpoint_shared(persist& p, execution_mode mode); - bool save_to_checkpoint_distributed(persist& p, const char *name); + bool save_to_checkpoint_distributed(persist& p, execution_mode mode); /** \brief Given directory to store checkpoint files, read state from file and add to number of bytes read */ - bool load_from_checkpoint_distributed(persist& p, const char *name); - - struct packing_header { - uint64_t current_pos; - uint64_t current_mini_batch_idx; - uint64_t data_size; - }; - bool pack_scalars(persist& p, const char *name) { - char fieldname[1024]; - lbann::persist_type persist_value; - std::string s_name(name); - if(s_name.compare("data_reader_validation") == 0){ - persist_value = persist_type::validate; - } else { - persist_value= persist_type::train; - } - - - snprintf(fieldname, sizeof(fieldname), "%s_current_mini_batch_idx", name); 
- p.write_uint64(persist_value, fieldname, (uint64_t) m_current_mini_batch_idx); - - int size = m_shuffled_indices.size(); - snprintf(fieldname, sizeof(fieldname), "%s_data_size", name); - p.write_uint64(persist_value, fieldname, (uint64_t) size); - - snprintf(fieldname, sizeof(fieldname), "%s_data_position", name); - p.write_uint64(persist_value, fieldname, (uint64_t) m_current_pos); - - snprintf(fieldname, sizeof(fieldname), "%s_data_indices", name); - p.write_int32_contig(persist_value, fieldname, &m_shuffled_indices[0], (uint64_t) size); - - return true; - } - - bool unpack_scalars(persist& p, struct packing_header *header, const char *name){ - char fieldname[1024]; - lbann::persist_type persist_value; - std::string s_name(name); - if(s_name.compare("data_reader_validation") == 0){ - persist_value = persist_type::validate; - } else { - persist_value= persist_type::train; - } - // Closest to non checkpoint run only loads m_current_pos - - // record minibatch index - uint64_t val; - - snprintf(fieldname, sizeof(fieldname), "%s_current_mini_batch_idx", name); - p.read_uint64(persist_value, fieldname, &val); - m_current_mini_batch_idx = (int) val; - - snprintf(fieldname, sizeof(fieldname), "%s_data_size", name); - p.read_uint64(persist_value, fieldname, &val); - auto size = (int) val; - - // get current position within data - snprintf(fieldname, sizeof(fieldname), "%s_data_position", name); - p.read_uint64(persist_value, fieldname, &val); - m_current_pos = (int) val; - //resize shuffled index array to hold values - m_shuffled_indices.resize(size); - - //read list of indices - snprintf(fieldname, sizeof(fieldname), "%s_data_indices", name); - p.read_int32_contig(persist_value, fieldname, &m_shuffled_indices[0], (uint64_t) size); - - if(header != nullptr){ - //shuffled data indices array size, used for resize after broadcast. Not unpacked. - header->data_size = size; - // all else, unpacked and set in unpack header. - header->current_pos = m_current_pos; - header->current_mini_batch_idx = m_current_mini_batch_idx; - } - - return true; - } - - void unpack_header(struct packing_header& header){ - m_current_pos = (int) header.current_pos; - m_current_mini_batch_idx = (int) header.current_mini_batch_idx; - } + bool load_from_checkpoint_distributed(persist& p, execution_mode mode); /// returns a const ref to the data store virtual const data_store_conduit& get_data_store() const { @@ -702,17 +631,9 @@ class generic_data_reader : public lbann_image_preprocessor { /// until later. void setup_data_store(int mini_batch_size); - void instantiate_data_store(const std::vector& local_list_sizes = std::vector()); + void instantiate_data_store(); - // note: don't want to make this virtual, since then all derived classes - // would have to override. But, this should only be called from within - // derived classes where it makes sense to do so. - // Once the sample_list class and file formats are generalized and - // finalized, it should (may?) be possible to code a single - // preload_data_store method. 
- virtual void preload_data_store() { - LBANN_ERROR("you should not be here"); - } + virtual void preload_data_store(); void set_gan_labelling(bool has_gan_labelling) { m_gan_labelling = has_gan_labelling; @@ -726,14 +647,26 @@ class generic_data_reader : public lbann_image_preprocessor { virtual bool priming_data_store() const; - void set_model(model *m) { m_model = m; } + void set_trainer(trainer *t) { m_trainer = t; } + + trainer& get_trainer() const { + if(m_trainer == nullptr) { LBANN_ERROR("get_trainer called with nullptr"); } + return *m_trainer; + } /// experimental; used to ensure all readers for jag_conduit_hdf5 /// have identical shuffled indices virtual void post_update() {} + /** Set the transform pipeline this data reader will use. */ + void set_transform_pipeline(transform::transform_pipeline&& tp) { + m_transform_pipeline = std::move(tp); + } + protected: + bool m_verbose = false; + // For use with conduit when samples are corrupt. mutable std::unordered_set m_using_random_node; @@ -759,7 +692,7 @@ class generic_data_reader : public lbann_image_preprocessor { lbann_comm *m_comm; - virtual bool fetch_data_block(CPUMat& X, El::Int thread_index, El::Int mb_size, El::Matrix& indices_fetched); + virtual bool fetch_data_block(CPUMat& X, El::Int block_offset, El::Int block_stride, El::Int mb_size, El::Matrix& indices_fetched); /** * Fetch a single sample into a matrix. @@ -861,9 +794,29 @@ class generic_data_reader : public lbann_image_preprocessor { bool m_master; + /** @brief Print the return values from various get_X methods to file + * + * For use in unit testing. Only the master prints. + * Currently only prints values from get_X methods that only depend + * on the data_reader (i.e, not on the trainer, model, etc) + */ + void print_get_methods(const std::string filename); + + /** + * Returns the number of the shuffled indices that are to be + * used. Code in this method was formerly in select_subset_of_data() + */ + size_t get_num_indices_to_use() const; + friend class data_reader_merge_features; friend class data_reader_merge_samples; +private: + + virtual void do_preload_data_store() { + LBANN_ERROR("Not implemented."); + } + protected : //var to support GAN bool m_gan_labelling; //boolean flag of whether its GAN binary label, default is false @@ -896,7 +849,7 @@ class generic_data_reader : public lbann_image_preprocessor { std::vector> m_thread_buffer; - std::shared_ptr m_io_thread_pool; + observer_ptr m_io_thread_pool; /// special handling for 1B jag; each reader /// owns a unique subset of the data @@ -906,7 +859,19 @@ class generic_data_reader : public lbann_image_preprocessor { /// this sets various member variables (num_iterations, m_reset_mini_batch_index, /// etc. void set_jag_variables(int mb_size); - model *m_model; + trainer *m_trainer; + + /** Transform pipeline for preprocessing data. */ + transform::transform_pipeline m_transform_pipeline; + + /// for use with data_store: issue a warning a single time if m_data_store != nullptr, + /// but we're not retrieving a conduit::Node from the store. 
This typically occurs + /// during the test phase + bool m_issue_warning; + + /// throws exception if get_absolute_sample_count() and + /// get_use_percent() are incorrect + void error_check_counts() const; }; template diff --git a/include/lbann/data_readers/data_reader_ascii.hpp b/include/lbann/data_readers/data_reader_ascii.hpp deleted file mode 100644 index 09504b49397..00000000000 --- a/include/lbann/data_readers/data_reader_ascii.hpp +++ /dev/null @@ -1,73 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// data_reader_ascii .hpp .cpp - generic_data_reader class for ASCII text files -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_DATA_READER_ASCII_HPP -#define LBANN_DATA_READER_ASCII_HPP - -#include "data_reader.hpp" - -namespace lbann { - -class ascii_reader : public generic_data_reader { - public: - ascii_reader(int sequence_length = 1, bool shuffle = true); - ascii_reader(const ascii_reader&) = default; - ascii_reader& operator=(const ascii_reader&) = default; - ~ascii_reader() override = default; - ascii_reader* copy() const override { return new ascii_reader(*this); } - - std::string get_type() const override { - return "ascii_reader"; - } - - void load() override; - - int get_linearized_data_size() const override { - return 128 * m_sequence_length; - } - int get_linearized_label_size() const override { - return 128 * m_sequence_length; - } - const std::vector get_data_dims() const override { - return {128 * m_sequence_length}; - } - - protected: - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - /** Length of text sequence. */ - int m_sequence_length; - /** Size of data file in bytes. */ - int m_file_size; - -}; - -} // namespace lbann - -#endif // LBANN_DATA_READER_ASCII_HPP diff --git a/include/lbann/data_readers/data_reader_cifar10.hpp b/include/lbann/data_readers/data_reader_cifar10.hpp index 7c72975bf98..a0c7ae61257 100644 --- a/include/lbann/data_readers/data_reader_cifar10.hpp +++ b/include/lbann/data_readers/data_reader_cifar10.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. 
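The preload_data_store() refactor in data_reader.hpp above moves the reader-specific work into a private do_preload_data_store() hook while the base class keeps the shared entry point. A schematic sketch of that split, with hypothetical class names and a guessed base-class body (the real body lives in the .cpp and is not shown in this diff).

// Schematic only: non-virtual-interface style split used by the refactor.
class reader_base {
public:
  virtual void preload_data_store() { // shared bookkeeping would happen here
    do_preload_data_store();          // then defer to the reader-specific hook
  }
private:
  virtual void do_preload_data_store() = 0;
};

class image_reader_example : public reader_base {
private:
  void do_preload_data_store() override {
    // reader-specific preloading, e.g. filling one conduit node per sample
  }
};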
// -// lbann_data_reader_cifar10 .hpp .cpp - generic_data_reader class for CIFAR10 dataset +// data_reader_cifar10 .hpp .cpp - Data reader for CIFAR-10/100 //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_DATA_READER_CIFAR10_HPP @@ -33,14 +33,23 @@ namespace lbann { +/** + * A data reader for the CIFAR-10/100 datasets. + * + * This requires the binary distributions of the datasets, which + * must retain their original filenames. + * CIFAR-10 vs -100 is inferred by the number of labels set. + * @note This does not store the coarse labels from CIFAR-100. + * + * See: + * https://www.cs.toronto.edu/~kriz/cifar.html + */ class cifar10_reader : public image_data_reader { public: - /// constructor cifar10_reader(bool shuffle = true); cifar10_reader(const cifar10_reader&) = default; cifar10_reader& operator=(const cifar10_reader&) = default; - /// destructor ~cifar10_reader() override; cifar10_reader* copy() const override { return new cifar10_reader(*this); } @@ -58,7 +67,13 @@ class cifar10_reader : public image_data_reader { bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; private: - std::vector > m_data; + /** + * Loaded image data. + * This will be stored in "OpenCV" format for ease of preprocessing. + */ + std::vector> m_images; + /** Loaded label information. */ + std::vector m_labels; }; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_csv.hpp b/include/lbann/data_readers/data_reader_csv.hpp index 58c55885c68..ae0ead7811f 100644 --- a/include/lbann/data_readers/data_reader_csv.hpp +++ b/include/lbann/data_readers/data_reader_csv.hpp @@ -30,7 +30,6 @@ #define LBANN_DATA_READER_CSV_HPP #include "data_reader.hpp" -#include "image_preprocessor.hpp" #include namespace lbann { diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index ba809f6547c..cde595e781e 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -30,8 +30,7 @@ #define IMAGE_DATA_READER_HPP #include "data_reader.hpp" -#include "image_preprocessor.hpp" -#include "cv_process.hpp" +#include "lbann/data_store/data_store_conduit.hpp" namespace lbann { class image_data_reader : public generic_data_reader { @@ -54,7 +53,7 @@ class image_data_reader : public generic_data_reader { // dataset specific functions void load() override; - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; + void setup(int num_io_threads, observer_ptr io_thread_pool) override; int get_num_labels() const override { return m_num_labels; @@ -79,11 +78,6 @@ class image_data_reader : public generic_data_reader { return {m_image_num_channels, m_image_height, m_image_width}; } - void save_image(Mat& pixels, const std::string filename, bool do_scale = true) override { - internal_save_image(pixels, filename, m_image_height, m_image_width, - m_image_num_channels, do_scale); - } - /// Return the sample list of current minibatch std::vector get_image_list_of_current_mb() const; @@ -100,13 +94,18 @@ class image_data_reader : public generic_data_reader { return m_image_list.at(idx); } + void do_preload_data_store() override; + + void load_conduit_node_from_file(int data_id, conduit::Node &node); + protected: + void copy_members(const image_data_reader &rhs); + /// Set the default values for the width, the height, the number of channels, and the number of labels of an image virtual void set_defaults(); bool fetch_label(Mat& Y, int data_id, int 
mb_idx) override; void set_linearized_image_size(); - protected: std::string m_image_dir; ///< where images are stored std::vector m_image_list; ///< list of image files and labels int m_image_width; ///< image width @@ -114,7 +113,9 @@ class image_data_reader : public generic_data_reader { int m_image_num_channels; ///< number of image channels int m_image_linearized_size; ///< linearized image size int m_num_labels; ///< number of labels - std::vector m_thread_cv_buffer; + + bool load_conduit_nodes_from_file(const std::unordered_set &data_ids); + }; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_imagenet.hpp b/include/lbann/data_readers/data_reader_imagenet.hpp index 4d6484e24c4..7f226f965de 100644 --- a/include/lbann/data_readers/data_reader_imagenet.hpp +++ b/include/lbann/data_readers/data_reader_imagenet.hpp @@ -30,35 +30,25 @@ #define LBANN_DATA_READER_IMAGENET_HPP #include "data_reader_image.hpp" -#include "cv_process.hpp" namespace lbann { class imagenet_reader : public image_data_reader { public: - imagenet_reader(bool shuffle) = delete; - imagenet_reader(const std::shared_ptr& pp, bool shuffle = true); - imagenet_reader(const imagenet_reader&); - imagenet_reader& operator=(const imagenet_reader&); + imagenet_reader(bool shuffle = true); + imagenet_reader(const imagenet_reader&) = default; + imagenet_reader& operator=(const imagenet_reader&) = default; ~imagenet_reader() override; imagenet_reader* copy() const override { return new imagenet_reader(*this); } - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; - std::string get_type() const override { return "imagenet_reader"; } protected: void set_defaults() override; - virtual bool replicate_processor(const cv_process& pp, const int nthreads); virtual CPUMat create_datum_view(CPUMat& X, const int mb_idx) const; bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - - protected: - /// preprocessor duplicated for each omp thread - std::vector > m_pps; - std::unique_ptr m_master_pps; }; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_imagenet_patches.hpp b/include/lbann/data_readers/data_reader_imagenet_patches.hpp deleted file mode 100644 index 49539429fab..00000000000 --- a/include/lbann/data_readers/data_reader_imagenet_patches.hpp +++ /dev/null @@ -1,74 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
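For reference, the binary CIFAR distributions the CIFAR-10/100 reader above expects use fixed-size records: one label byte (CIFAR-10) or two label bytes, coarse then fine (CIFAR-100), followed by 3072 channel-major pixel bytes (1024 red, 1024 green, 1024 blue for a 32x32 image). A minimal sketch of reading one record; this is illustrative, not the LBANN implementation.

#include <cstdint>
#include <fstream>
#include <vector>

// Read one CIFAR record; label_bytes is 1 for CIFAR-10, 2 for CIFAR-100.
bool read_cifar_record(std::ifstream& in, int label_bytes,
                       std::uint8_t& fine_label,
                       std::vector<std::uint8_t>& pixels) {
  std::vector<std::uint8_t> labels(label_bytes);
  if (!in.read(reinterpret_cast<char*>(labels.data()), label_bytes)) {
    return false; // end of file or short read
  }
  fine_label = labels.back(); // CIFAR-100 stores the coarse label first
  pixels.resize(3 * 32 * 32);
  return static_cast<bool>(
      in.read(reinterpret_cast<char*>(pixels.data()), pixels.size()));
}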
-// -// lbann_data_reader_imagenet_patches .hpp .cpp - extract patches from ImageNet dataset -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_DATA_READER_IMAGENET_PATCHES_HPP -#define LBANN_DATA_READER_IMAGENET_PATCHES_HPP - -#include "data_reader_image.hpp" -#include "cv_process_patches.hpp" - -namespace lbann { -class imagenet_reader_patches : public image_data_reader { - public: - imagenet_reader_patches(bool shuffle) = delete; - imagenet_reader_patches(const std::shared_ptr& pp, bool shuffle = true); - imagenet_reader_patches(const imagenet_reader_patches&); - imagenet_reader_patches& operator=(const imagenet_reader_patches&); - ~imagenet_reader_patches() override; - - imagenet_reader_patches* copy() const override { return new imagenet_reader_patches(*this); } - - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; - - std::string get_type() const override { - return "imagenet_reader_patches"; - } - - int get_linearized_data_size() const override { - return m_image_linearized_size * m_num_patches; - } - const std::vector get_data_dims() const override { - return {m_num_patches*m_image_num_channels, m_image_height, m_image_width}; - } - - protected: - void set_defaults() override; - virtual bool replicate_processor(const cv_process_patches& pp, const int nthreads); - virtual std::vector create_datum_views(CPUMat& X, const int mb_idx) const; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - - protected: - int m_num_patches; ///< number of patches extracted - /// preprocessor for patches duplicated for each omp thread - std::vector > m_pps; - std::unique_ptr m_master_pps; -}; - -} // namespace lbann - -#endif // LBANN_DATA_READER_IMAGENET_PATCHES_HPP diff --git a/include/lbann/data_readers/data_reader_jag.hpp b/include/lbann/data_readers/data_reader_jag.hpp deleted file mode 100644 index c10daf0c9de..00000000000 --- a/include/lbann/data_readers/data_reader_jag.hpp +++ /dev/null @@ -1,231 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-//////////////////////////////////////////////////////////////////////////////// - -#ifndef _DATA_READER_JAG_HPP_ -#define _DATA_READER_JAG_HPP_ - -#include "cnpy.h" -#include -#include -#include "lbann/base.hpp" -#include "lbann/data_readers/opencv.hpp" -#include "data_reader.hpp" - -namespace lbann { - -/** - * Loads the pairs of JAG simulation inputs and results - */ -class data_reader_jag : public generic_data_reader { - public: - using data_t = double; - using scalar_t = double; - using input_t = double; - - /** - * Dependent/indepdendent variable types - * - JAG_Image: simulation output images - * - JAG_Scalar: simulation output scalars - * - JAG_Input: simulation input parameters - * - Undefined: the default - */ - enum variable_t {Undefined = 0, JAG_Image, JAG_Scalar, JAG_Input}; - - data_reader_jag(bool shuffle = true); - // TODO: copy constructor and assignment operator for deep-copying if needed - // The cnpy structure relies on shared_ptr - data_reader_jag(const data_reader_jag&) = default; - data_reader_jag& operator=(const data_reader_jag&) = default; - ~data_reader_jag() override; - data_reader_jag* copy() const override { return new data_reader_jag(*this); } - - std::string get_type() const override { - return "data_reader_jag"; - } - - /// Choose which data to use for independent variable - void set_independent_variable_type(const std::vector< std::vector >& independent); - /// Choose which data to use for dependent variable - void set_dependent_variable_type(const std::vector< std::vector >& dependent); - - /// Tell which data to use for independent variable - std::vector get_independent_variable_type() const; - /// Tell which data to use for dependent variable - std::vector get_dependent_variable_type() const; - - /// Set normalization mode: 0 = none, 1 = dataset-wise, 2 = image-wise - void set_normalization_mode(int mode); - - /// Set the image dimension - void set_image_dims(const int width, const int height); - - /// Load data and do data reader's chores. 
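A small usage sketch of the removed data_reader_jag configuration API declared above. The particular grouping of images and scalars as independent variables and simulation inputs as dependent variables is illustrative only, as are the dimensions.

// Sketch against the former lbann/data_readers/data_reader_jag.hpp interface.
void configure_jag_reader_example() {
  lbann::data_reader_jag reader(/*shuffle=*/true);
  using vt = lbann::data_reader_jag::variable_t;
  reader.set_independent_variable_type({{vt::JAG_Image}, {vt::JAG_Scalar}});
  reader.set_dependent_variable_type({{vt::JAG_Input}});
  reader.set_normalization_mode(2); // 0 = none, 1 = dataset-wise, 2 = image-wise
  reader.set_image_dims(64, 64);    // illustrative image width and height
  reader.load();
}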
- void load() override; - - /// Show the description - std::string get_description() const; - - /// Return the number of samples - size_t get_num_samples() const; - - /// Return the linearized size of an image - size_t get_linearized_image_size() const; - /// Return the linearized size of scalar outputs - size_t get_linearized_scalar_size() const; - /// Return the linearized size of inputs - size_t get_linearized_input_size() const; - - int get_linearized_data_size() const override; - int get_linearized_response_size() const override; - std::vector get_linearized_data_sizes() const; - std::vector get_linearized_response_sizes() const; - const std::vector get_data_dims() const override; - - /// Return the pointer to the raw image data - data_t* get_image_ptr(const size_t i) const; - /// Return the image data as a 1-D vector of lbann::DataType - cv::Mat get_image(const size_t i) const; - - /// Return the pointer to the raw scalar data - scalar_t* get_scalar_ptr(const size_t i) const; - /// Return the scalar values of the i-th sample - std::vector get_scalar(const size_t i) const; - - /// Return the pointer to the raw input data - input_t* get_input_ptr(const size_t i) const; - /// Return the input values of the simulation correspoding to the i-th sample - std::vector get_input(const size_t i) const; - - void save_image(Mat& pixels, const std::string filename, bool do_scale = true) override; - - protected: - /// add data type for independent variable - void add_independent_variable_type(const variable_t independent); - /// add data type for dependent variable - void add_dependent_variable_type(const variable_t dependent); - - /// check if type t is used in the independent variable - bool is_independent(const variable_t t) const; - /// check if type t is used in the dependent variable - bool is_dependent(const variable_t t) const; - /// check if type t is used in either the indepedent or the dependent variable - bool is_used(const variable_t t) const; - - using generic_data_reader::get_linearized_size; - /// Return the linearized size of a particular JAG variable type - size_t get_linearized_size(const variable_t t) const; - /// Return the dimension of a particular JAG variable type - const std::vector get_dims(const variable_t t) const; - - virtual std::vector - create_datum_views(CPUMat& X, const std::vector& sizes, const int mb_idx) const; - - bool fetch(CPUMat& X, int data_id, int mb_idx, - const data_reader_jag::variable_t vt, const std::string tag); - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_response(CPUMat& Y, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - /** - * Load the data in the numpy format file. - * Use only first_n available samples if specified. 
- */ - void load(const std::string image_file, const std::string scalar_file, - const std::string input_file, const size_t first_n = 0u); - - /// Check the dimensions of loaded data - bool check_data(size_t& num_samples) const; - - /** - * Normalize image data to [0 1] scale once after loading based on the mode - * 0 (none): no normalization - * 1 (dataset-wise): map the min/max of all the pixels in image data to 0/1 - * 2 (image-wise): map the min/max of all the pixels in current image to 0/1 - */ - void normalize_image(); - - /// Set the linearized size of an image - void set_linearized_image_size(); - /// Set the linearized size of scalar outputs - void set_linearized_scalar_size(); - /// Return the linearized size of inputs - void set_linearized_input_size(); - - int get_num_labels() const override { - return m_num_labels; - } - - int get_linearized_label_size() const override { - return m_num_labels; - } - /// Return the maximum element of all the images - data_t get_image_max() const; - /// Return the minimum element of all the images - data_t get_image_min() const; - - protected: - /// independent variable type - std::vector m_independent; - /// dependent variable type - std::vector m_dependent; - - /// Whether image output data have been loaded - bool m_image_loaded; - /// Whether scalar output data have been loaded - bool m_scalar_loaded; - /// Whether simulation input data have been loaded - bool m_input_loaded; - - /// The number of samples - size_t m_num_samples; - /// The linearized size of an image - size_t m_linearized_image_size; - /// The linearized size of scalar outputs - size_t m_linearized_scalar_size; - /// The linearized size of inputs - size_t m_linearized_input_size; - - /// image normalization mode - int m_image_normalization; - int m_image_width; ///< image width - int m_image_height; ///< image height - - /// List of jag output images - cnpy::NpyArray m_images; - /// List of jag scalar outputs - cnpy::NpyArray m_scalars; - /// List of jag input - cnpy::NpyArray m_inputs; - - /// The smallest pixel value in image data (useful for normalization or visualization) - data_t m_img_min; - /// The largest pixel value in image data (useful for normalization or visualization) - data_t m_img_max; - int m_num_labels; ///< number of labels -}; - -} // end of namespace lbann -#endif // _DATA_READER_JAG_HPP_ diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index 0938fa79438..be53df9aced 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -27,20 +27,23 @@ #ifndef _DATA_READER_JAG_CONDUIT_HPP_ #define _DATA_READER_JAG_CONDUIT_HPP_ -#include "lbann_config.hpp" // may define LBANN_HAS_CONDUIT +#include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT -#include "lbann/data_readers/opencv.hpp" #include "data_reader.hpp" #include "conduit/conduit.hpp" #include "hdf5.h" -#include "lbann/data_readers/cv_process.hpp" #include #include #include #include #include -#include "lbann/data_readers/sample_list_jag.hpp" + +//#define _USE_IO_HANDLE_ +#ifdef _USE_IO_HANDLE_ +#include "lbann/data_readers/sample_list_conduit_io_handle.hpp" +#else +#include "lbann/data_readers/sample_list_hdf5.hpp" +#endif namespace lbann { @@ -58,8 +61,16 @@ class data_reader_jag_conduit : public generic_data_reader { /// Type for the pair of the key string of a sample and the handle of the file that contains it using sample_locator_t = std::pair; using sample_map_t 
= std::vector; ///< valid sample map type - using sample_t = sample_list_jag::sample_t; - using sample_file_id_t = sample_list_jag::sample_file_id_t; + using sample_name_t = std::string; +#ifdef _USE_IO_HANDLE_ + using sample_list_t = sample_list_conduit_io_handle; +#else + using sample_list_t = sample_list_hdf5; +#endif + using file_handle_t = sample_list_t::file_handle_t; + using sample_file_id_t = sample_list_t::sample_file_id_t; + using sample_t = std::pair; + //using sample_t = sample_list_t::sample_t; /// linear transform on X defined as: first * X + second => X' using linear_transform_t = std::pair; @@ -76,15 +87,13 @@ class data_reader_jag_conduit : public generic_data_reader { /// Type to define a prefix string and the minimum length requirement to filter out a key using prefix_t = std::pair; - data_reader_jag_conduit(bool shuffle = true) = delete; - data_reader_jag_conduit(const std::shared_ptr& pp, bool shuffle = true); + data_reader_jag_conduit(bool shuffle = true); data_reader_jag_conduit(const data_reader_jag_conduit&); - data_reader_jag_conduit(const data_reader_jag_conduit&, const std::vector& ds_sample_move_list); data_reader_jag_conduit& operator=(const data_reader_jag_conduit&); ~data_reader_jag_conduit() override; data_reader_jag_conduit* copy() const override { return new data_reader_jag_conduit(*this); } - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; + void setup(int num_io_threads, observer_ptr io_thread_pool) override; std::string get_type() const override { return "data_reader_jag_conduit"; @@ -165,8 +174,8 @@ class data_reader_jag_conduit : public generic_data_reader { /// Set every reader instances in a model to have an independent index list void set_list_per_model(bool flag) { m_list_per_model = flag; }; - bool has_list_per_model() const { return m_list_per_model; } - bool has_list_per_trainer() const { return m_list_per_trainer; } + bool has_list_per_model() const override { return m_list_per_model; } + bool has_list_per_trainer() const override { return m_list_per_trainer; } /// Fetch data of a mini-batch or reuse it from the cache of the leading reader @@ -199,16 +208,14 @@ class data_reader_jag_conduit : public generic_data_reader { /// Return the dimension of data const std::vector get_data_dims() const override; - /// Return the slice points for linearized independent variables - std::vector get_slice_points_independent() const; - /// Return the slice points for linearized dependent variables - std::vector get_slice_points_dependent() const; - int get_num_data() const override; int get_num_labels() const override; int get_linearized_label_size() const override; int get_linearized_size(const std::string& desc) const override; + std::vector get_slice_points(const slice_points_mode var_category, + bool& is_supported) override; + void set_split_image_channels(); void unset_split_image_channels(); bool check_split_image_channels() const; @@ -216,15 +223,6 @@ class data_reader_jag_conduit : public generic_data_reader { /// Show the description std::string get_description() const; - /// Return the image simulation output of the i-th sample - std::vector get_cv_images(const size_t i, conduit::Node& sample) const; - - /** - * Return the images of the i-th sample as an 1-D vector of lbann::DataType - * There is one image per view, each of which is taken at closest to the bang time. 
- */ - std::vector get_images(const size_t i, conduit::Node& sample) const; - /// Return the scalar simulation output data of the i-th sample std::vector get_scalars(const size_t i, conduit::Node& sample) const; @@ -234,13 +232,8 @@ class data_reader_jag_conduit : public generic_data_reader { template static size_t add_val(const std::string key, const conduit::Node& n, std::vector& vals); - void save_image(Mat& pixels, const std::string filename, bool do_scale = true) override; - void setup_data_store(int mini_batch_size); - /// A untiliy function to convert the pointer to image data into an opencv image - static cv::Mat cast_to_cvMat(const std::pair img, - const int height, const int num_ch=1); /// A utility function to convert a JAG variable type to name string static std::string to_string(const variable_t t); @@ -259,11 +252,10 @@ class data_reader_jag_conduit : public generic_data_reader { /// once the sample_list class and file formats are generalized and /// finalized, it should (may?) be possible to code a single /// preload_data_store method. - void preload_data_store() override; + void do_preload_data_store() override; virtual void set_defaults(); - virtual bool replicate_processor(const cv_process& pp, const int nthreads); - virtual void copy_members(const data_reader_jag_conduit& rhs, const std::vector& ds_sample_move_list = std::vector()); + virtual void copy_members(const data_reader_jag_conduit& rhs); /// add data type for independent variable void add_independent_variable_type(const variable_t independent); @@ -280,7 +272,12 @@ class data_reader_jag_conduit : public generic_data_reader { /// Return the dimension of a particular JAG variable type const std::vector get_dims(const variable_t t) const; /// Return the slice points for linearized data or responses - std::vector get_slice_points(const std::vector< std::vector >& var) const; + std::vector get_slice_points_impl(const std::vector< std::vector >& var) const; + /// Return the slice points for linearized independent variables + std::vector get_slice_points_independent() const; + /// Return the slice points for linearized dependent variables + std::vector get_slice_points_dependent() const; + /// A utility function to make a string to show all the variable types static std::string to_string(const std::vector& vec); /// A utility function to make a string to show all the groups of variable types @@ -349,6 +346,9 @@ class data_reader_jag_conduit : public generic_data_reader { */ static bool check_non_numeric(const std::string key); + bool has_path(const file_handle_t& h, const std::string& path) const; + void read_node(const file_handle_t& h, const std::string& path, conduit::Node& n) const; + /// Allow const access to the conduit data structure static const conduit::Node& get_conduit_node(const conduit::Node& n_base, const std::string key); /** Load the conduit node with the data of the sample i identified by key @@ -361,14 +361,14 @@ class data_reader_jag_conduit : public generic_data_reader { bool has_conduit_path(const size_t i, const std::string& key) const; /// Obtain image data - std::vector< std::vector > get_image_data(const size_t i, conduit::Node& sample) const; + std::vector< std::vector > get_image_data(const size_t i, conduit::Node& sample) const; - bool data_store_active() const { + bool data_store_active() const override { bool flag = generic_data_reader::data_store_active(); return (m_data_store != nullptr && flag); } - bool priming_data_store() const { + bool priming_data_store() const override { bool flag 
= generic_data_reader::priming_data_store(); return (m_data_store != nullptr && flag); } @@ -410,10 +410,6 @@ class data_reader_jag_conduit : public generic_data_reader { /// Keys to select a set of simulation input parameters to use. By default, use all. std::vector m_input_keys; - /// preprocessor duplicated for each omp thread - std::vector > m_pps; - std::unique_ptr m_master_pps; - /** * Set of keys that are associated with non_numerical values. * Such a variable requires a specific method for mapping to a numeric value. @@ -467,14 +463,11 @@ class data_reader_jag_conduit : public generic_data_reader { std::vector m_input_normalization_params; typedef std::pair conduit_sample; - sample_list_jag m_sample_list; + sample_list_t m_sample_list; bool m_list_per_trainer; bool m_list_per_model; - /** temporary image normalization - * The inputs are the image to normalize, the image source id and the channel id. - */ - void image_normalization(cv::Mat& img, size_t i, size_t ch) const; + void preload_helper(const hid_t& h, const std::string &sample_name, const std::string &field_name, int data_id, conduit::Node &node); }; /** @@ -602,5 +595,4 @@ inline size_t data_reader_jag_conduit::add_val(const std::string key, const cond } } // end of namespace lbann -#endif // LBANN_HAS_CONDUIT #endif // _DATA_READER_JAG_CONDUIT_HPP_ diff --git a/include/lbann/data_readers/data_reader_mnist.hpp b/include/lbann/data_readers/data_reader_mnist.hpp index 2d3b30e0ed6..ebd8df8ec27 100644 --- a/include/lbann/data_readers/data_reader_mnist.hpp +++ b/include/lbann/data_readers/data_reader_mnist.hpp @@ -30,7 +30,6 @@ #define LBANN_DATA_READER_MNIST_HPP #include "data_reader_image.hpp" -#include "image_preprocessor.hpp" namespace lbann { diff --git a/include/lbann/data_readers/data_reader_mnist_siamese.hpp b/include/lbann/data_readers/data_reader_mnist_siamese.hpp deleted file mode 100644 index 4536e3cebad..00000000000 --- a/include/lbann/data_readers/data_reader_mnist_siamese.hpp +++ /dev/null @@ -1,126 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// data_reader_mnist_siamese .hpp .cpp - data reader class for mnist dataset -// employing two images per sample to feed siamese model -//////////////////////////////////////////////////////////////////////////////// - -#ifndef DATA_READER_MNIST_SIAMESE_HPP -#define DATA_READER_MNIST_SIAMESE_HPP - -#include "data_reader_multi_images.hpp" -#include "cv_process.hpp" -#include -#include -#include -#include - -namespace lbann { - -/** - * With MNIST dataset, there is no individual image file. All the images or - * labels are packed into a single binary file respectively. This reader - * pre-loads all the data into memory as minist_reader does. - * However, to feed a siamese model, this reader randomly chooses the paired - * input on-line. It maintains another data index list, 'm_shuffled_indices2'. - * It first copies the primary list maintined by the base class to the secondary - * list, and shuffles the secondary whenever the primary gets shuffled via the - * overridden shuffle_indices() method. - */ -class data_reader_mnist_siamese : public data_reader_multi_images { - public: - using label_t = unsigned char; - using sample_t = std::pair; - - data_reader_mnist_siamese(const std::shared_ptr& pp, bool shuffle = true); - data_reader_mnist_siamese(const data_reader_mnist_siamese&); - data_reader_mnist_siamese& operator=(const data_reader_mnist_siamese&); - ~data_reader_mnist_siamese() override; - - data_reader_mnist_siamese* copy() const override { - return new data_reader_mnist_siamese(*this); - } - - std::string get_type() const override { - return "data_reader_mnist_siamese"; - } - - /** Set up MNIST dataset-specific input parameters, which are pre-defined - * and also set as the default. This does not change the setup, but only - * preserves the default. - */ - void set_input_params(const int, const int, const int, const int) override; - - // dataset specific functions - void load() override; - - /// Fetch this mini-batch's samples into X by calling the new overloaded fetch_datum() - int fetch_data(CPUMat& X, El::Matrix& indices_fetched) override; - /// Fetch this mini-batch's labels into Y by calling the new overloaded fetch_label() - int fetch_labels(CPUMat& Y) override; - - protected: - /** - * Set the default configuration such as the width, height, and number of - * channels of the image sample. - */ - void set_defaults() override; - - // unused virtual interfaces replaced by the new interfaces that taks a pair - // of indices to sample list. - using data_reader_multi_images::fetch_datum; - using data_reader_multi_images::fetch_label; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - /** - * Fetch two data items identified by the pair of indices to the pre-loaded data list, - * and put them into the column mb_idx of matrix x. - */ - virtual bool fetch_datum(CPUMat& X, std::pair data_id, int mb_idx); - /** - * Take a pair of indices to the preloaded sample list, and compare the labels - * of the corresponding samples. Store 1 if equal or 0 at the column mb_idx of - * the given matrix Y. - */ - virtual bool fetch_label(CPUMat& Y, std::pair data_id, int mb_idx); - - /** - * Shuffle the second index list added in this class as well as the one in the - * base class whenever the latter gets shuffled. 
- */ - void shuffle_indices() override; - - protected: - using generic_data_reader::m_shuffled_indices; - /// To randomly choose the siamese pair input online - std::vector m_shuffled_indices2; - /// Store the preloaded data - std::vector> m_image_data; -}; - -} // namespace lbann - -#endif // DATA_READER_MNIST_SIAMESE_HPP diff --git a/include/lbann/data_readers/data_reader_moving_mnist.hpp b/include/lbann/data_readers/data_reader_moving_mnist.hpp deleted file mode 100644 index 034bca57880..00000000000 --- a/include/lbann/data_readers/data_reader_moving_mnist.hpp +++ /dev/null @@ -1,86 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_DATA_READER_MOVING_MNIST_HPP -#define LBANN_DATA_READER_MOVING_MNIST_HPP - -#include "data_reader.hpp" - -namespace lbann { - -class moving_mnist_reader : public generic_data_reader { -public: - moving_mnist_reader(El::Int num_frames, - El::Int image_height, - El::Int image_width, - El::Int num_objects); - moving_mnist_reader(const moving_mnist_reader&) = default; - moving_mnist_reader& operator=(const moving_mnist_reader&) = default; - ~moving_mnist_reader() override = default; - moving_mnist_reader* copy() const override { return new moving_mnist_reader(*this); } - - std::string get_type() const override { - return "moving_mnist_reader"; - } - - void load() override; - - const std::vector get_data_dims() const override; - int get_num_labels() const override; - int get_linearized_data_size() const override; - int get_linearized_label_size() const override; - -protected: - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - -private: - - /** Number of frames. */ - El::Int m_num_frames; - /** Frame height. */ - El::Int m_image_height; - /** Frame width. */ - El::Int m_image_width; - /** Number of MNIST digits in each frame. */ - El::Int m_num_objects; - - /** Number of MNIST samples. */ - El::Int m_num_raw_images = 0; - /** MNIST image height. */ - El::Int m_raw_image_height = 0; - /** MNIST image width. */ - El::Int m_raw_image_width = 0; - /** Raw MNIST image data. */ - std::vector m_raw_image_data; - /** Raw MNIST label data. 
*/ - std::vector m_raw_label_data; - -}; - -} // namespace lbann - -#endif // LBANN_DATA_READER_MOVING_MNIST_HPP diff --git a/include/lbann/data_readers/data_reader_multi_images.hpp b/include/lbann/data_readers/data_reader_multi_images.hpp deleted file mode 100644 index 93a2959bd7d..00000000000 --- a/include/lbann/data_readers/data_reader_multi_images.hpp +++ /dev/null @@ -1,117 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// data_reader_multi_images .hpp .cpp - generic data reader class for datasets -// employing multiple images per sample -//////////////////////////////////////////////////////////////////////////////// - -#ifndef DATA_READER_MULTI_IMAGES_HPP -#define DATA_READER_MULTI_IMAGES_HPP - -#include "data_reader_imagenet.hpp" -#include "cv_process.hpp" -#include -#include -#include -#include - -namespace lbann { -class data_reader_multi_images : public imagenet_reader { - public: - using img_src_t = std::vector; - using sample_t = std::pair; - - data_reader_multi_images(bool shuffle) = delete; - data_reader_multi_images(const std::shared_ptr& pp, bool shuffle = true); - data_reader_multi_images(const data_reader_multi_images&); - data_reader_multi_images& operator=(const data_reader_multi_images&); - ~data_reader_multi_images() override; - - data_reader_multi_images* copy() const override { - return new data_reader_multi_images(*this); - } - - std::string get_type() const override { - return "data_reader_multi_images"; - } - - /** Set up imagenet specific input parameters - * If argument is set to 0, then this method does not change the value of - * the corresponding parameter. However, width and height can only be both - * zero or both non-zero. 
- */ - void set_input_params(const int width, const int height, const int num_ch, - const int num_labels, const int num_img_srcs); - - void set_input_params(const int width, const int height, const int num_ch, - const int num_labels) override; - - // dataset specific functions - void load() override; - - int get_linearized_data_size() const override { - return m_image_linearized_size * m_num_img_srcs; - } - const std::vector get_data_dims() const override { - return {static_cast(m_num_img_srcs)*m_image_num_channels, m_image_height, m_image_width}; - } - - /// Return the sample list of current minibatch - std::vector get_image_list_of_current_mb() const; - - /// Allow read-only access to the entire sample list - const std::vector& get_image_list() const { - return m_image_list; - } - - sample_t get_sample(size_t idx) const { - return m_image_list.at(idx); - } - - /// The number of image sources or the number of siamese heads. e.g., 2; - /// this method is added to support data_store functionality - unsigned int get_num_img_srcs() const { - return m_num_img_srcs; - } - - protected: - void set_defaults() override; - virtual std::vector create_datum_views(CPUMat& X, const int mb_idx) const; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - bool read_text_stream(std::istream& text_stream, std::vector& list); - bool load_list(const std::string file_name, std::vector& list, - const bool fetch_list_at_once = false); - - protected: - std::vector m_image_list; ///< list of image files and labels - /// The number of image sources or the number of siamese heads. e.g., 2 - unsigned int m_num_img_srcs; -}; - -} // namespace lbann - -#endif // DATA_READER_MULTI_IMAGES_HPP diff --git a/include/lbann/data_readers/data_reader_multihead_siamese.hpp b/include/lbann/data_readers/data_reader_multihead_siamese.hpp deleted file mode 100644 index dc95f3cb7e8..00000000000 --- a/include/lbann/data_readers/data_reader_multihead_siamese.hpp +++ /dev/null @@ -1,94 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// data_reader_multihead_siamese .hpp .cpp - data reader to use m patches -// generated offline. 
-//////////////////////////////////////////////////////////////////////////////// - -#ifndef DATA_READER_MULTIHEAD_SIAMESE_HPP -#define DATA_READER_MULTIHEAD_SIAMESE_HPP - -#include "data_reader_multi_images.hpp" -#include "cv_process.hpp" -#include "offline_patches_npz.hpp" -#include -#include -#include -#include - -namespace lbann { -class data_reader_multihead_siamese : public data_reader_multi_images { - public: - using label_t = offline_patches_npz::label_t; - using sample_t = offline_patches_npz::sample_t; - - data_reader_multihead_siamese(const std::shared_ptr& pp, unsigned int nimages, bool shuffle = true); - data_reader_multihead_siamese(const std::shared_ptr& pp, bool shuffle = true); - - data_reader_multihead_siamese(const data_reader_multihead_siamese&); - data_reader_multihead_siamese& operator=(const data_reader_multihead_siamese&); - ~data_reader_multihead_siamese() override; - - data_reader_multihead_siamese* copy() const override { - return new data_reader_multihead_siamese(*this); - } - - std::string get_type() const override { - return "data_reader_multihead_siamese"; - } - - /** Set up imagenet specific input parameters - * If argument is set to 0, then this method does not change the value of - * the corresponding parameter. However, width and height can only be both - * zero or both non-zero. - */ - void set_input_params(const int width, const int height, const int num_ch, - const int num_labels) override; - - // dataset specific functions - void load() override; - - /// Return the sample list of current minibatch - std::vector get_image_list_of_current_mb() const; - - /// Allow read-only access to the entire sample list - std::vector get_image_list() const; - - sample_t get_sample(size_t idx) const { - return m_samples.get_sample(idx); - } - - protected: - void set_defaults() override; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - protected: - offline_patches_npz m_samples; -}; - -} // namespace lbann - -#endif // DATA_READER_MULTIHEAD_SIAMESE_HPP diff --git a/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp b/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp new file mode 100644 index 00000000000..1e691fbd5d8 --- /dev/null +++ b/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp @@ -0,0 +1,180 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+// +//////////////////////////////////////////////////////////////////////////////// + + +#ifndef LBANN_DATA_READER_NPZ_RAS_LIPID_HPP +#define LBANN_DATA_READER_NPZ_RAS_LIPID_HPP + +#include "conduit/conduit.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/data_readers/data_reader.hpp" +#include "conduit/conduit.hpp" +#include +#include + +namespace lbann { + /** + * Data reader for data stored in numpy (.npz) files that are encapsulated + * in conduit::Nodes + */ +class ras_lipid_conduit_data_reader : public generic_data_reader { + +public: + + ras_lipid_conduit_data_reader(const bool shuffle); + ras_lipid_conduit_data_reader(const ras_lipid_conduit_data_reader&); + ras_lipid_conduit_data_reader& operator=(const ras_lipid_conduit_data_reader&); + ~ras_lipid_conduit_data_reader() override {} + + ras_lipid_conduit_data_reader* copy() const override { return new ras_lipid_conduit_data_reader(*this); } + + std::string get_type() const override { + return "ras_lipid_conduit_data_reader"; + } + + void load() override; + + void set_num_labels(int n) { m_num_labels = n; } + + int get_linearized_data_size() const override { return m_seq_len*m_num_features; } + int get_linearized_label_size() const override { return m_seq_len*m_num_labels; } + int get_linearized_response_size() const override { return m_num_response_features; } + //const std::vector get_data_dims() const override { return m_data_dims; } + const std::vector get_data_dims() const override { return {get_linearized_data_size()}; } + int get_num_labels() const override { return m_seq_len*m_num_labels; } + +private: + + int m_num_features = 0; + int m_num_labels = 3; + int m_num_response_features = 0; + std::vector m_data_dims; + + /** @brief Total of train + validate samples */ + size_t m_num_global_samples; + size_t m_num_train_samples; + size_t m_num_validate_samples; + + /** the number of sequential samples that are combined into a multi-sample */ + int m_seq_len = 1; + + // owner map for multi-samples + std::unordered_map m_multi_sample_to_owner; + + std::unordered_map> m_filename_to_multi_sample; + //std::unordered_map> m_filename_to_multi_sample; + + std::unordered_map m_multi_sample_id_to_first_sample; + +// sample_list_t m_sample_list; + + /** @brief List of input npz filenames */ + std::vector m_filenames; + + /** @brief m_samples_per_file[j] contains the number of samples in the j-th file */ + std::vector m_samples_per_file; + + /** @brief Maps a data_id to the file index (in m_filenames) that + * contains the sample, and the offset in that file's npy array */ + std::unordered_map> m_data_id_map; + + /** @brief Maps a field name to the data's shape + * + * Example: "bbs" -> {184, 3} + */ + std::unordered_map> m_datum_shapes; + + /** @brief Maps a field name to the word size */ + std::unordered_map m_datum_word_sizes; + + /** @brief Maps a field name to the number of bytes in the datum + * + * Example: "bbs" -> 184*3*word_size + */ + std::unordered_map m_datum_num_bytes; + + /** @brief Maps a field name to the number of words in the datum */ + std::unordered_map m_datum_num_words; + + std::vector m_min; + std::vector m_max_min; + std::vector m_mean; + std::vector m_std_dev; + bool m_use_min_max; + bool m_use_z_score; + + //===================================================================== + // private methods follow + //===================================================================== + + /** @brief Contains common code for operator= and copy ctor */ + void copy_members(const ras_lipid_conduit_data_reader& 
rhs); + + void do_preload_data_store() override; + + bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; + bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; + bool fetch_response(CPUMat& Y, int data_id, int mb_idx) override; + + /** @brief Populates in m_datum_shapes, m_datum_num_bytes, m_datum_word_sizes */ + void fill_in_metadata(); + + /** @brief Re-build the data store's owner map + * + * This one-off, wouldn't need to do this if we were using sample lists. + */ + void rebuild_data_store_owner_map(); + + /** @brief Fills in m_samples_per_file */ + void get_samples_per_file(); + + /** @brief Write file sizes to disk + * + * Each line of the output file contains: filename num_samples + */ + void write_file_sizes(); + + /** @brief Read file that contains: filename num_samples + * + * see: write_file_sizes() + */ + void read_file_sizes(); + + void read_normalization_data(); + + /** Print some statistics to cout */ + void print_shapes_etc(); + + void load_the_next_sample(conduit::Node &node, int sample_index, std::map &data); + + void construct_multi_sample(std::vector &work, int data_id, conduit::Node &node); + +}; + +} // namespace lbann + +#endif //LBANN_DATA_READER_NPZ_RAS_LIPID_HPP diff --git a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp index 7d7cd00bf93..57473224f9f 100644 --- a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp +++ b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp @@ -30,6 +30,7 @@ #define LBANN_DATA_READER_NUMPY_NPZ_CONDUIT_HPP #include "lbann/data_readers/data_reader.hpp" +#include "conduit/conduit.hpp" #include namespace lbann { @@ -37,7 +38,8 @@ namespace lbann { * Data reader for data stored in numpy (.npz) files that are encapsulated . * in conduit::Nodes */ - class numpy_npz_conduit_reader : public generic_data_reader { +class numpy_npz_conduit_reader : public generic_data_reader { + public: numpy_npz_conduit_reader(const bool shuffle); // These need to be explicit because of some issue with the cnpy copy @@ -73,7 +75,7 @@ namespace lbann { const std::vector get_data_dims() const override { return m_data_dims; } protected: - void preload_data_store(); + void do_preload_data_store() override; bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; @@ -105,7 +107,16 @@ namespace lbann { void fill_in_metadata(); std::vector m_filenames; - }; + + bool load_numpy_npz_from_file(const std::unordered_set &data_ids, std::unordered_set& label_classes); + + void load_conduit_node(const std::string filename, int data_id, conduit::Node &output, bool reset = true); + + std::unordered_map> m_npz_cache; + + void load_npz(const std::string filename, int data_id, conduit::Node &node); + +}; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_python.hpp b/include/lbann/data_readers/data_reader_python.hpp index 35264a18e7b..372e449d3b6 100644 --- a/include/lbann/data_readers/data_reader_python.hpp +++ b/include/lbann/data_readers/data_reader_python.hpp @@ -29,113 +29,18 @@ #include "data_reader.hpp" #ifdef LBANN_HAS_PYTHON -#include +#include "lbann/utils/python.hpp" namespace lbann { -namespace python { - -/** @brief Singleton class to manage embedded Python session. - * - * This is very experimental. Be warned. - */ -class manager { -public: - - /** @brief Get singleton instance. */ - static manager& get_instance(); - /** @brief Construct singleton instance. 
- * @details If there is already an instance, it is destroyed. - */ - static void create(); - /** Destroy singleton instance. */ - static void destroy(); - - /** @brief Check if a Python error has occurred. - * - * Throw an exception if an error is detected. - * - * @param force_error Whether to force an exception to be thrown. - */ - void check_error(bool force_error = false) const; - - ~manager(); - -private: - - /** @brief Singleton instance. */ - static std::unique_ptr m_instance; - - /** @brief State on main Python thread. */ - PyThreadState* m_thread_state = nullptr; - - // Lifetime functions - manager(); - manager(const manager&) = delete; - manager& operator=(const manager&) = delete; - -}; - -/** @brief RAII wrapper for Python GIL. - * - * The Python interpreter is not thread-safe, so it uses the "global - * interpreter lock" to ensure only one thread is executing at a - * time. Multithreading is achieved by periodically transferring - * control of the GIL between threads. This makes it hard to get - * meaningful speedups from simple multithreading. Certain - * operations, e.g. I/O and numerical kernels in NumPy, can be - * efficiently parallelized because they yield control of the GIL - * while working. - * - * This is very experimental. Be warned. - */ -class global_interpreter_lock { -public: - - global_interpreter_lock(const manager&); - ~global_interpreter_lock(); - -private: - - global_interpreter_lock(const global_interpreter_lock&) = delete; - global_interpreter_lock& operator=(const global_interpreter_lock&) = delete; - - PyGILState_STATE m_gil_state; - -}; - -/** @brief Convenience wrapper around @c PyObject pointer. - * - * This is very experimental. Be warned. - */ -class object { -public: - object(PyObject* obj = nullptr); - object(std::string val); - object(El::Int val); - object(DataType val); - object(const object& other); - object& operator=(const object& other); - object(object&& other); - object& operator=(object&& other); - ~object(); - inline PyObject* get() { return m_ptr; } - inline const PyObject* get() const { return m_ptr; } - inline operator PyObject*() { return get(); } - inline operator const PyObject*() const { return get(); } -private: - PyObject* m_ptr; -}; - -} // namespace python - class python_reader : public generic_data_reader { public: python_reader(std::string module, std::string module_dir, std::string sample_function, std::string num_samples_function, - std::string sample_dims_function); + std::string sample_dims_function, + bool shuffle); python_reader(const python_reader&) = default; python_reader& operator=(const python_reader&) = default; ~python_reader() override; @@ -150,22 +55,58 @@ class python_reader : public generic_data_reader { int get_linearized_data_size() const override; int get_linearized_label_size() const override; - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; + void setup(int num_io_threads, observer_ptr io_thread_pool) override; void load() override; protected: bool fetch_data_block(CPUMat& X, - El::Int thread_id, + El::Int block_offset, + El::Int block_stride, El::Int mb_size, El::Matrix& indices_fetched) override; bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; private: + + /** @brief Dimensions of data sample tensor. */ std::vector m_sample_dims; + /** @brief Number of data samples in data set. */ El::Int m_num_samples; + + /** @brief User-provided Python function to access data samples. + * + * The function is expected to take one integer argument for the + * sample index. 
It must return an iterator that defines the + * entries in a data sample. + */ python::object m_sample_function; + + /** @brief Wrapper function around sample access function. + * + * This function will be executed on worker processes (see @c + * m_process_pool). It will obtain a data sample from @c + * m_sample_function and copy it into a @c m_shared_memory_array. + */ + python::object m_sample_function_wrapper; + + /** @brief Pool of worker processes. + * + * From the Python @c multiprocessing module. + */ python::object m_process_pool; + /** @brief Shared memory array. + * + * @c RawArray from the Python @c multiprocessing module. + */ + python::object m_shared_memory_array; + + /** @brief Pointer into shared memory array. + * + * Points to buffer for @c m_shared_memory_array. + */ + DataType* m_shared_memory_array_ptr = nullptr; + }; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_smiles.hpp b/include/lbann/data_readers/data_reader_smiles.hpp new file mode 100644 index 00000000000..b820f6d9d37 --- /dev/null +++ b/include/lbann/data_readers/data_reader_smiles.hpp @@ -0,0 +1,151 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_DATA_READER_SMILES_HPP +#define LBANN_DATA_READER_SMILES_HPP + +#include "conduit/conduit.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/data_readers/data_reader.hpp" + +namespace lbann { + /** + * Data reader for SMILES data that has been converted to an array + * of short ints and stored in binary format. + * Binary format is: (n_int, int (repeating n_int times) ) repeating + * last entry in the file is the only entry stored as an integer; it + * contains the number of samples. 
Second to last entry is the maximum + * number of ints in any sample; this is stored as a short int + */ +class smiles_data_reader : public generic_data_reader { + +public: + + smiles_data_reader(const bool shuffle); + smiles_data_reader(const smiles_data_reader&); + smiles_data_reader& operator=(const smiles_data_reader&); + ~smiles_data_reader() override; + + smiles_data_reader* copy() const override { return new smiles_data_reader(*this); } + + std::string get_type() const override { + return "smiles_data_reader"; + } + + void load() override; + + int get_linearized_data_size() const override { return m_linearized_data_size; } + int get_linearized_label_size() const override { return m_linearized_label_size; } + int get_linearized_response_size() const override { return m_linearized_response_size; } + const std::vector get_data_dims() const override { return {get_linearized_data_size()}; } + int get_num_labels() const override { return m_num_labels; } + + void set_sequence_length(int n) { m_linearized_data_size = n; } + int get_sequence_length() { return get_linearized_data_size(); } + +private: + + /// used for sanity checking in load() and do_preload(); + /// may eventually go away + int m_min_index = INT_MAX; + int m_max_index = 0; + + //==== start hack to make it work fast ==== + + // maps: sample_id to + std::unordered_map> m_sample_lookup; + + std::vector m_data; + + void get_sample(int sample_id, std::vector &sample_out); + + void setup_local_cache(); + + // to enable this feature, add '#define DEBUG_F' to data_reader_smiles.cpp; + // this is ONLY for testing/development; if enabled, each rank will encode + // all samples after loading, and prior to the first epoch + void test_encode(); + + char m_delimiter = '\0'; + + // CAUTION: line_number is same as sample_id, i.e, assumes a single + // data input file + int get_smiles_string_length(const std::string &line, int line_number); + + //==== end hack to make it work fast ==== + + int m_linearized_data_size = 0; + int m_linearized_label_size = 0; + int m_linearized_response_size = 0; + int m_num_labels = 0; + + // these may be changed when the vocab file is read + short m_pad = 420; + short m_unk = 421; + short m_bos = 422; + short m_eos = 423; + + bool m_has_header = true; + + std::unordered_map m_vocab; + std::unordered_map m_vocab_inv; + + std::mutex m_mutex; + + size_t m_missing_char_in_vocab_count = 0; + std::unordered_set m_missing_chars; + + //===================================================================== + // private methods follow + //===================================================================== + + void get_delimiter(); + + /// returns a lower bound on memory usage for dataset + size_t get_mem_usage() const; + + /** @brief Contains common code for operator= and copy ctor */ + void copy_members(const smiles_data_reader& rhs); + + void do_preload_data_store() override; + + bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; + bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; + bool fetch_response(CPUMat& Y, int data_id, int mb_idx) override; + + void print_statistics() const; + void load_vocab(); + int get_num_lines(std::string fn); + void construct_conduit_node(int data_id, const std::string &line, conduit::Node &node); + void encode_smiles(const char *smiles, short size, std::vector &data, int data_id); + void encode_smiles(const std::string &smiles, std::vector &data, int data_id); + void decode_smiles(const std::vector &data, std::string &out); +}; + +} // namespace lbann + +#endif 
//LBANN_DATA_READER_SMILES_HPP diff --git a/include/lbann/data_readers/data_reader_triplet.hpp b/include/lbann/data_readers/data_reader_triplet.hpp deleted file mode 100644 index a1ee9e07871..00000000000 --- a/include/lbann/data_readers/data_reader_triplet.hpp +++ /dev/null @@ -1,95 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// data_reader_triplet .hpp .cpp - data reader to use triplet patches -// generated offline. -// -// Depreciated and replaced by data_reader_multihead_siamese .hpp .cpp. -// Kept here just for reference. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef DATA_READER_TRIPLET_HPP -#define DATA_READER_TRIPLET_HPP - -#include "data_reader_multi_images.hpp" -#include "cv_process.hpp" -#include "offline_patches_npz.hpp" -#include -#include -#include -#include - -namespace lbann { -class data_reader_triplet : public data_reader_multi_images { - public: - using label_t = offline_patches_npz::label_t; - using sample_t = offline_patches_npz::sample_t; - - data_reader_triplet(const std::shared_ptr& pp, bool shuffle = true); - data_reader_triplet(const data_reader_triplet&); - data_reader_triplet& operator=(const data_reader_triplet&); - ~data_reader_triplet() override; - - data_reader_triplet* copy() const override { - return new data_reader_triplet(*this); - } - - std::string get_type() const override { - return "data_reader_triplet"; - } - - /** Set up imagenet specific input parameters - * If argument is set to 0, then this method does not change the value of - * the corresponding parameter. However, width and height can only be both - * zero or both non-zero. 
- */ - void set_input_params(const int width, const int height, const int num_ch, - const int num_labels) override; - - // dataset specific functions - void load() override; - - /// Return the sample list of current minibatch - std::vector get_image_list_of_current_mb() const; - - /// Allow read-only access to the entire sample list - std::vector get_image_list() const; - - sample_t get_sample(size_t idx) const { - return m_samples.get_sample(idx); - } - - protected: - void set_defaults() override; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - protected: - offline_patches_npz m_samples; -}; - -} // namespace lbann - -#endif // DATA_READER_TRIPLET_HPP diff --git a/include/lbann/data_readers/image_preprocessor.hpp b/include/lbann/data_readers/image_preprocessor.hpp deleted file mode 100644 index fb730e23bf1..00000000000 --- a/include/lbann/data_readers/image_preprocessor.hpp +++ /dev/null @@ -1,209 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// image_preprocessor.hpp - Preprocessing utilities for image inputs -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_IMAGE_PREPROCESSOR -#define LBANN_IMAGE_PREPROCESSOR - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#include "lbann/data_readers/opencv.hpp" -#else -#error OpenCV required -#endif -#include "lbann/base.hpp" - -namespace lbann { - -/** - * Support class for preprocessing image inputs. - * Supports the following transforms: - * - Random horizontal and vertical flips - * - Random rotations - * - Random horizontal and vertical shifts - * - Random shearing - * - Standardize to 0 mean - * - Standardize to unit variance - * - Scale to the range [0, 1] - * - Normalize via z-score - */ -class lbann_image_preprocessor { - public: - lbann_image_preprocessor(); - lbann_image_preprocessor(const lbann_image_preprocessor&) = default; - lbann_image_preprocessor& operator=( - const lbann_image_preprocessor&) = default; - virtual ~lbann_image_preprocessor() {} - - /** Whether to do random horizontal flips. */ - void horizontal_flip(bool b) { - m_horizontal_flip = b; - } - /** Whether to do random vertical flips. */ - void vertical_flip(bool b) { - m_vertical_flip = b; - } - /** Do random rotations up to range degrees (0-180). 
*/ - void rotation(float range) { - m_rotation_range = range; - } - /** Do random horizontal shifts up to range (fraction of image width). */ - void horizontal_shift(float range) { - m_horizontal_shift = range; - } - /** Do random vertical shifts up to range (fraction of image height). */ - void vertical_shift(float range) { - m_vertical_shift = range; - } - /** Do random shears up to range (radians). */ - void shear_range(float range) { - m_shear_range = range; - } - /** Whether to subtract the sample-wise mean. */ - void subtract_mean(bool b) { - m_mean_subtraction = b; - } - /** Whether to normalize to unit variance, sample-wise. */ - void unit_variance(bool b) { - m_unit_variance = b; - } - /** Whether to scale to [0, 1] (assumes max value is 255). */ - void scale(bool b) { - m_scale = b; - } - /** - * Whether to normalize by z-scores, sample-wise. - * This and mean subtraction/unit variance are mutually exclusive. - */ - void z_score(bool b) { - m_z_score = b; - } - /** Disable all data augmentation. */ - void disable_augmentation() { - horizontal_flip(false); - vertical_flip(false); - rotation(0.0f); - horizontal_shift(0.0f); - vertical_shift(0.0f); - shear_range(0.0f); - } - - /** - * Add noise to data (disable by default) - * noise_factor control the ammount of noise - * to be set to a value above zero but less than 1 (say 0.5) - * */ - void add_noise(float noise_factor=0.0f) { - m_noise_factor = noise_factor; - } - - /** - * Preprocess pixels according to the currently-set augmentation transforms. - * @param pixels The pixels to process as a column vector (num x 1 mat). - * @param imheight Height of the image in pixels. - * @param imwidth Width of the image in pixels. - * @param num_channels The number of channels pixels has. - */ - void augment(Mat& pixels, unsigned imheight, unsigned imwidth, - unsigned num_channels); - /** - * Normalize poxels according to the currently-set transforms. - * @param pixels The pixels to process as a column vector. - * @param num_channels The number of channels pixels has. - */ - void normalize(Mat& pixels, unsigned num_channels); - - /** - * External interface to saving an image. - * Classes that want to support this should use it to interface with - * internal_save_image. - * @param pixels The image to save (as a column vector). - * @param filename The image filename (type inferred from extension). - * @param do_scale Whether pixels has been scaled (default true). - */ - virtual void save_image(Mat& pixels, const std::string filename, - bool do_scale = true) {} - - protected: - /** Whether to do horizontal flips. */ - bool m_horizontal_flip; - /** Whether to do vertical flips. */ - bool m_vertical_flip; - /** Range in degrees for rotations (0-180). */ - float m_rotation_range; - /** Range (fraction of total width) for horizontal shifts. */ - float m_horizontal_shift; - /** Range (fraction of total height) for vertical shifts. */ - float m_vertical_shift; - /** Shear angle (radians). */ - float m_shear_range; - /** Whether to normalize to 0 mean. */ - bool m_mean_subtraction; - /** Whether to normalize to unit variance. */ - bool m_unit_variance; - /** Whether to scale to [0, 1]. */ - bool m_scale; - /** Whether to normalize via z-score. 
*/ - bool m_z_score; - - float m_noise_factor; - - void mean_subtraction(Mat& pixels, unsigned num_channels); - void unit_variance(Mat& pixels, unsigned num_channels); - void unit_scale(Mat& pixels, unsigned num_channels); - void z_score(Mat& pixels, unsigned num_channels); - - void pixel_noise(Mat& pixels); - - /** - * Convert a column vector of pixels to an OpenCV matrix. - */ - cv::Mat cv_pixels(const Mat& pixels, unsigned imheight, unsigned imwidth, - unsigned num_channels); - /** Undo cv_pixels. */ - void col_pixels(const cv::Mat& sqpixels, Mat& pixels, unsigned num_channels); - - /** @brief Flip sqpixels. - * @param sqpixels The image to flip - * @param flip_flag OpenCV flip flag: 0=vertical, 1=horizontal, -1=both. - */ - void flip(cv::Mat& sqpixels, int flip_flag); - /** Apply the affine transformation in 3x3 matrix trans. */ - void affine_trans(cv::Mat& sqpixels, const Mat& trans); - - /** - * Save pixels to filename. - */ - void internal_save_image(Mat& pixels, const std::string filename, - unsigned imheight, unsigned imwidth, - unsigned num_channels, bool do_scale); -}; - -} // namespace lbann - -#endif // LBANN_IMAGE_PREPROCESSOR diff --git a/include/lbann/data_readers/image_utils.hpp b/include/lbann/data_readers/image_utils.hpp deleted file mode 100644 index b52a7f4cb78..00000000000 --- a/include/lbann/data_readers/image_utils.hpp +++ /dev/null @@ -1,86 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
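The removed image_preprocessor selected among mutually exclusive sample-wise normalizations (mean subtraction, unit variance, [0, 1] scaling, z-score). Below is a minimal sketch of the z-score transform that the z_score(true) option named, written as a free function on a flattened pixel vector; it is illustrative only and not the deleted LBANN implementation, and the epsilon guard is an assumption.

    // Sketch: sample-wise z-score normalization of a flattened pixel vector,
    // i.e. the transform the removed z_score(true) option selected.
    // Not the original LBANN code; the epsilon guard is an assumption.
    #include <cmath>
    #include <numeric>
    #include <vector>

    void z_score_normalize(std::vector<float>& pixels) {
      if (pixels.empty()) { return; }
      const float n = static_cast<float>(pixels.size());
      const float mean = std::accumulate(pixels.begin(), pixels.end(), 0.0f) / n;
      float var = 0.0f;
      for (float p : pixels) { var += (p - mean) * (p - mean); }
      const float stdev = std::sqrt(var / n) + 1e-7f;  // small guard against divide-by-zero
      for (float& p : pixels) { p = (p - mean) / stdev; }
    }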
-// -// image_utils .cpp .hpp - Image I/O utility functions -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_IMAGE_UTILS_HPP -#define LBANN_IMAGE_UTILS_HPP - -#include "lbann/base.hpp" -#include -#include // operator typeid - -#ifdef LBANN_HAS_OPENCV -#include "lbann/data_readers/cv_utils.hpp" -#include "lbann/data_readers/cv_process_patches.hpp" -#endif - - -namespace lbann { -class image_utils { - public: - static bool loadIMG(std::vector& image_buf, int& Width, int& Height, bool Flip, unsigned char *&Pixels); - static bool loadIMG(const std::string& Imagefile, int& Width, int& Height, bool Flip, unsigned char *&Pixels, std::vector& buf); - static bool saveIMG(const std::string& Imagefile, int Width, int Height, bool Flip, unsigned char *Pixels); - -#ifdef LBANN_HAS_OPENCV - // The other load/import methods rely on these core methods - /// process an image and put it into an LBANN Mat data block - static bool process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& out); - /// process an image and put it into a serialized buffer - static bool process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process& pp, std::vector& out); - /// process an image and put it into an LBANN Mat data blocks - static bool process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& out); -#endif // LBANN_HAS_OPENCV - - // new function, to support sharded data reader and data store functionality - static bool load_image(std::vector& image_buf, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, cv::Mat* cv_buf = nullptr); - - // new function, to support sharded data reader and data store functionality - static bool load_image(std::vector& image_buf, - int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, cv::Mat* cv_buf = nullptr); - - // load/save an image into/from an LBANN data block of El::Matrix type - // Use a thread save temporary buffer for decoding the image - /// Load an image from a file and put it into an LBANN Mat data block - static bool load_image(const std::string& filename, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, std::vector& buf, cv::Mat* cv_buf = nullptr); - /// Load an image from a file, extract patches from it and put them into LBANN Mat data blocks - static bool load_image(const std::string& filename, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, std::vector& buf, cv::Mat* cv_buf = nullptr); - /// Save an image using data from an LBANN Mat data block - static bool save_image(const std::string& filename, const int Width, const int Height, const int Type, cv_process& pp, const CPUMat& data); - - // import/export via a buffer of std::vector containg the raw bytes of an image file - /// Import an image from a file buffer (inbuf) and put it into an LBANN Mat data block - static bool import_image(cv::InputArray inbuf, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, cv::Mat* cv_buf = nullptr); - /// Import an image from a file buffer (inbuf), extract patches from it and put them into LBANN Mat data blocks - static bool import_image(cv::InputArray inbuf, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, cv::Mat* cv_buf = nullptr); - /// Export an image using data from an LBANN Mat block into a file buffer (outbuf) - static bool export_image(const std::string& fileExt, std::vector& outbuf, const int Width, const 
int Height, const int Type, cv_process& pp, const CPUMat& data); -}; - -} // end of namespace lbann - -#endif // LBANN_IMAGE_UTILS_HPP diff --git a/include/lbann/data_readers/numpy_conduit_converter.hpp b/include/lbann/data_readers/numpy_conduit_converter.hpp deleted file mode 100644 index 32317487043..00000000000 --- a/include/lbann/data_readers/numpy_conduit_converter.hpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef NUMPY_CONDUIT_CONVERTER_HPP -#define NUMPY_CONDUIT_CONVERTER_HPP - -#include "lbann_config.hpp" -#include "conduit/conduit.hpp" - -namespace lbann { - -/** - * The numpy_conduit_converter class contains static method(s) for - * reading numpy files and copying the contents to a conduit file. - * - * In general the schema for npz files, after conversion to conduit, is: - * - * { - * data_id (int) : - * // one or more of the following sections - * { - * section_name : - * { - * "word_size": , - * "fortran_order: <0|1>, - * "num_vals": , - * "shape": <[ vector ]>, - * "data": - * } - * } - * } - * - * cosmoflow has the following sections: - * "data": - * "frm": - * "responses": - */ - -class numpy_conduit_converter { - public: - - static void load_conduit_node(const std::string filename, int data_id, conduit::Node &output, bool reset_conduit_node = true); - -}; - -} // namespace lbann - -#endif // NUMPY_CONDUIT_CONVERTER_HPP diff --git a/include/lbann/data_readers/offline_patches_npz.hpp b/include/lbann/data_readers/offline_patches_npz.hpp deleted file mode 100644 index c433d232ced..00000000000 --- a/include/lbann/data_readers/offline_patches_npz.hpp +++ /dev/null @@ -1,159 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
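The deleted numpy_conduit_converter documented the Conduit layout it produced for each npz section (word_size, fortran_order, num_vals, shape, data) under a data_id. The sketch below hand-builds a node with that layout, assuming a data_id of 0 and a single "data" section; all values are placeholders, not converter output.

    // Sketch: hand-building a conduit::Node with the schema documented above
    // (one "data" section under data_id 0). Values are placeholders only.
    #include "conduit/conduit.hpp"
    #include <vector>

    conduit::Node make_example_node() {
      conduit::Node n;
      n["0/data/word_size"] = 4;                                  // e.g. 4-byte floats
      n["0/data/fortran_order"] = 0;
      n["0/data/num_vals"] = 6;
      n["0/data/shape"] = std::vector<conduit::int64>{2, 3};
      n["0/data/data"] = std::vector<float>{0, 1, 2, 3, 4, 5};
      return n;
    }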
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef _OFFLINE_PATCHES_NPZ_HPP_ -#define _OFFLINE_PATCHES_NPZ_HPP_ - -#include "cnpy.h" -#include -#include - -namespace lbann { - -/** - * Loads the list of patche files, generated off-line, and the label per sample. - * As the list is quite large itself in the ASCII text format, it is packed and - * loaded as a compressed NumPy file (*.npz). - * Each image file name is compressed further by representing it as a sequence of - * indices to common substring dictionaries. There are two types of substring - * dictionaries, root and variant. There is an array of index sequences and an - * array of dictionary substrings per type, and a label array. - * For example, a file path train/n000111/abc.tag1.tag2.jpg would be represented - * as 'r[i][j][k]', 'v[i][j][x]', 'v[i][j][y]', 'v[i][j][z]' for the j-th patch - * of the i-th sample where 'r[i][j][k]' is "train/n000111", and 'v[i][j][x]', - * 'v[i][j][y]' and 'v[i][j][z]' is "abc", "tag1", and "tag2" respectively. - * 'r' is the root dictionary and 'v' is the variant dictionary. - * The list is kept in a compressed form, and uncompressed on-demand during execution. - * Each index sequence array is kept as a CNPY data structure, and each dictionary - * array is loaded into a vector of strings. The label array is loaded into a - * vector of uint8_t. - */ -class offline_patches_npz { - public: - using label_t = uint8_t; - using sample_t = std::pair, label_t>; - - offline_patches_npz(); - offline_patches_npz(size_t npatches); - offline_patches_npz(std::string divider); - offline_patches_npz(size_t npatches, std::string divider); - // TODO: copy constructor and assignment operator for deep-copying if needed - // The cnpy structure relies on shared_ptr - - /** - * Load the data in the compressed numpy format file. - * Use only first_n available samples if specified. - * keep_file_lists indicates whether to remove file lists loaded - * once converting them to vector of strings. - * Need to keep it for selecting a range of samples afterwards. 
- */ - bool load(const std::string filename, size_t first_n = 0u, - bool keep_file_lists = false); - /// Show the description - std::string get_description() const; - - /// Return the number of samples - size_t get_num_samples() const { - return m_item_class_list.size(); - } - /// Return the number of patches per sample (the number of image data sources) - size_t get_num_patches() const { - return m_num_patches; - } - /// Set the number of patches per sample (the number of image data sources) - void set_num_patches(size_t npatches) { - m_num_patches = npatches; - } - /// Reconsturct and return the meta-data (patch file names and the label) of idx-th sample - sample_t get_sample(const size_t idx) const; - /// Return the label of idx-th sample - label_t get_label(const size_t idx) const; - -#ifdef _OFFLINE_PATCHES_NPZ_OFFLINE_TOOL_MODE_ - std::vector get_file_roots() const; - size_t count_samples(const size_t num_roots) const; - bool select(const std::string out_file, const size_t sample_start, size_t& sample_end); -#endif // _OFFLINE_PATCHES_NPZ_OFFLINE_TOOL_MODE_ - - protected: - /// Check the dimensions of loaded data - bool check_data() const; - - protected: - /// Whether loaded data have passed the format check - bool m_checked_ok; - /// The number of image patches per sample (i.e. the num of patch files to read) - size_t m_num_patches; - /** - * List of index sequences to the dictionary of common file path substrings (m_file_root_list) - * per patch file (dimension: num_samples * num_patches) - */ - cnpy::NpyArray m_item_root_list; - /** - * List of index sequences to the dictionary of common file path substrings (m_file_variant_list) - * per patch file (dimension: num_samples * num_patches) - */ - cnpy::NpyArray m_item_variant_list; - /// list of labels (dimension: num_samples) - std::vector m_item_class_list; - /// The list of common substrings that a file path starts with (dimension is, for example 1000 in case of imagenet data) - std::vector m_file_root_list; - /// The list of common substrings for file path variants - std::vector m_file_variant_list; - /// The text file name of file_root_list - std::string m_file_root_list_name; - /// The text file name of file_variant_list - std::string m_file_variant_list_name; - /// A substring after which the file name of variants begins to differ (e.g., ".JPEG.") - std::string m_variant_divider; - /// control how the text dictionary files are loaded: whether to load all at once and parse or to stream in - bool m_fetch_text_dict_at_once; - /** - * indicate if the numpy file is reformatted to - * - treat an array of character strings as a 2-D character array, of which - * the second dimension is the length of the largest string. This is - * relevant to file_{root,variant}_list. - * - convert item_class_list to a list of label_t(uint8_t) instead of a - * list of a charater sequence (two digits). - * The reformatting is get around the inability of cnpy library for writing - * an array of character strings. - */ - bool m_lbann_format; - - /** - * The original data structure for m_file_root_list. It is used by select() - * if keep_file_lists was on when loading. - */ - cnpy::NpyArray m_file_root_list_org; - /** - * The original data structure for m_file_variant_list. It is used by select() - * if keep_file_lists was on when loading. 
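The class comment above encodes each patch path as indices into a root-substring dictionary and a variant-substring dictionary, joined by a divider. The sketch below reassembles the example path from that comment ("train/n000111/abc.tag1.tag2.jpg"); the dictionary entries and extension are taken from that example, and this is not the removed get_sample() logic.

    // Sketch: reassembling the example path from the class comment above out of
    // a root entry, variant entries, and an extension. Illustrative only; not the
    // removed offline_patches_npz::get_sample() implementation.
    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      const std::string root = "train/n000111";                            // root-dictionary entry
      const std::vector<std::string> variants = {"abc", "tag1", "tag2"};   // variant-dictionary entries
      const std::string ext = ".jpg";

      std::string path = root + "/";
      for (std::size_t i = 0; i < variants.size(); ++i) {
        path += variants[i];
        if (i + 1 < variants.size()) { path += "."; }
      }
      path += ext;
      std::cout << path << std::endl;   // prints train/n000111/abc.tag1.tag2.jpg
      return 0;
    }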
- */ - cnpy::NpyArray m_file_variant_list_org; -}; - -} // end of namespace lbann -#endif // _OFFLINE_PATCHES_NPZ_HPP_ diff --git a/include/lbann/data_readers/opencv.hpp b/include/lbann/data_readers/opencv.hpp deleted file mode 100644 index 9adc7efa0d7..00000000000 --- a/include/lbann/data_readers/opencv.hpp +++ /dev/null @@ -1,68 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// opencv.hpp - LBANN header for opencv -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN header for opencv - * - includes OpenCV headers according to the version - * - use newer built-in variables in place of the deprecated ones for newer OpenCV - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _LBANN_OPENCV_H_INCLUDED_ -#define _LBANN_OPENCV_H_INCLUDED_ - -#include -#if (!defined(CV_VERSION_EPOCH) && (CV_VERSION_MAJOR >= 3)) -#include -#include -#include -#define _LBANN_CV_UNCHANGED_ cv::IMREAD_UNCHANGED -#define _LBANN_CV_GRAYSCALE_ cv::IMREAD_GRAYSCALE -#define _LBANN_CV_COLOR_ cv::IMREAD_COLOR -#define _LBANN_CV_ANYDEPTH_ cv::IMREAD_ANYDEPTH -#define _LBANN_CV_ANYCOLOR_ cv::IMREAD_ANYCOLOR -#else -#include -#include -#include -#include -#define _LBANN_CV_UNCHANGED_ CV_LOAD_IMAGE_UNCHANGED -#define _LBANN_CV_GRAYSCALE_ CV_LOAD_IMAGE_GRAYSCALE -#define _LBANN_CV_COLOR_ CV_LOAD_IMAGE_COLOR -#define _LBANN_CV_ANYDEPTH_ CV_LOAD_IMAGE_ANYDEPTH -#define _LBANN_CV_ANYCOLOR_ CV_LOAD_IMAGE_ANYCOLOR -#endif - -#define _LBANN_CV_BLUE_ 0 -#define _LBANN_CV_GREEN_ 1 -#define _LBANN_CV_RED_ 2 - -#endif // _LBANN_OPENCV_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/opencv_extensions.hpp b/include/lbann/data_readers/opencv_extensions.hpp deleted file mode 100644 index b24ed360d4d..00000000000 --- a/include/lbann/data_readers/opencv_extensions.hpp +++ /dev/null @@ -1,233 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. 
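The removed opencv.hpp mapped LBANN's _LBANN_CV_* flags to cv::IMREAD_* on OpenCV 3+ and to the legacy CV_LOAD_IMAGE_* constants otherwise. Below is a minimal sketch of how such a flag is consumed, written directly against the OpenCV 3+ spellings; the path handling is hypothetical.

    // Sketch: consuming a version-portable imread flag such as the removed
    // _LBANN_CV_COLOR_. Written for OpenCV 3+ (cv::IMREAD_COLOR); on pre-3.x
    // OpenCV the macro resolved to CV_LOAD_IMAGE_COLOR instead.
    #include <opencv2/imgcodecs.hpp>
    #include <string>

    cv::Mat load_color(const std::string& path) {
      cv::Mat img = cv::imread(path, cv::IMREAD_COLOR);   // 8-bit, 3-channel BGR
      return img;                                         // empty Mat if the read failed
    }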
-// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// opencv_extensions.hpp - LBANN's cv::Mat pixel type handling mechanisms -//////////////////////////////////////////////////////////////////////////////// - -#ifdef LBANN_HAS_OPENCV -#ifndef _LBANN_OPENCV_EXTENSIONS_H_INCLUDED_ -#define _LBANN_OPENCV_EXTENSIONS_H_INCLUDED_ -#include "lbann/data_readers/opencv.hpp" - -namespace lbann { - -/// A template structure to convert an OpenCV identifier of channel depth to a standard C++ type -template class cv_depth_type {}; - -/// define a specialized mapper from a CV channel type to its c++ native type -#define _def_cv_depth_translation(_CV_TYPE_, _NATIVE_TYPE_) \ -template<> struct cv_depth_type<_CV_TYPE_> { \ - public: \ - using standard_type = _NATIVE_TYPE_; \ -} - -/// cv_depth_type maps to uint8_t -_def_cv_depth_translation(CV_8U, uint8_t); -/// cv_depth_type maps to int8_t -_def_cv_depth_translation(CV_8S, int8_t); -/// cv_depth_type maps to uint16_t -_def_cv_depth_translation(CV_16U, uint16_t); -/// cv_depth_type maps to int16_t -_def_cv_depth_translation(CV_16S, int16_t); -/// cv_depth_type maps to int32_t -_def_cv_depth_translation(CV_32S, int32_t); -/// cv_depth_type maps to float -_def_cv_depth_translation(CV_32F, float); -/// cv_depth_type maps to double -_def_cv_depth_translation(CV_64F, double); - - -/// Convert an OpenCV identifier of image depth to a standard C++ type -#define _depth_type(_cv_depth_) lbann::cv_depth_type<_cv_depth_>::standard_type - - -/** A template structure to map the type of channel into the - * corresponding OpenCV type identifier of image. - * - _T_: The channel value type as a native C++ type - */ -template -struct cv_image_type { - /** A static member function which returns the OpenCV image type based on - * the channel type and number of channels: - * - _C_: The number of channels It ranges from 1 to CV_CN_MAX which is 512 - */ - static int T(const int _C_) { - return CV_MAKETYPE(cv::DataType<_T_>::depth, _C_); - } - /** A static member function which maps a native c++ type to the corresponding - * OpenCV channel type. 
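cv_depth_type and cv_image_type above wrap the standard OpenCV machinery that combines a per-channel depth code with a channel count. The checks below state that mapping with plain OpenCV; this is stock OpenCV behavior, not code from the removed header.

    // Sketch: the mapping the removed cv_depth_type/cv_image_type templates wrap.
    // cv::DataType gives the depth code for a C++ type and CV_MAKETYPE combines
    // it with a channel count. Plain OpenCV; not LBANN code.
    #include <opencv2/core.hpp>
    #include <cassert>

    void check_type_mapping() {
      static_assert(cv::DataType<float>::depth == CV_32F, "float maps to CV_32F");
      assert(CV_MAKETYPE(cv::DataType<float>::depth, 3) == CV_32FC3);          // 3-channel float
      assert(CV_MAKETYPE(cv::DataType<unsigned char>::depth, 1) == CV_8UC1);   // 1-channel uchar
    }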
- * The depth value returned ranges from 0 to (CV_DEPTH_MAX-1) which is 7 - */ - static int T() { - return cv::DataType<_T_>::depth; - } -}; - - -template -struct depth_normalization { - static double factor() { - if (!std::is_integral::value) { - return 1.0; - } else { - return 1.0/std::numeric_limits::max(); - } - } - static double inverse_factor() { - if (!std::is_integral::value) { - return 1.0; - } else { - return std::numeric_limits::max(); - } - } -}; - -template<> -struct depth_normalization { - static double factor() { - return 1.0; - } - static double inverse_factor() { - return 1.0; - } -}; - -/// Checks if an OpenCV depth code corresponds to an integral type -inline bool is_float(const int cv_depth) { - return ((cv_depth == CV_64F) || (cv_depth == CV_32F)); -} - -inline bool check_if_cv_Mat_is_float_type(const cv::Mat& image) { - return is_float(image.depth()); -} - -inline bool check_if_cv_Mat_has_same_shape(const cv::Mat& image1, const cv::Mat& image2) { - return ((image1.cols == image2.cols) && - (image1.rows == image2.rows) && - (image1.channels() == image2.channels())); -} - -template -static double depth_norm_factor() { - return depth_normalization::factor(); -} - -template -static double depth_norm_inverse_factor() { - return depth_normalization::inverse_factor(); -} - -/// Return the factor for unit scaling with the type indicated by the OpenCV depth -double get_depth_normalizing_factor(const int cv_depth); -/// Return the factor to inverse the unit scaling -double get_depth_denormalizing_factor(const int cv_depth); - -/// returns the number of bytes that would be used for the image without compresstion and any header -inline size_t image_data_amount(const cv::Mat& img) { - return static_cast(CV_ELEM_SIZE(img.depth())* - CV_MAT_CN(img.type())* - img.cols*img.rows); -} - -} // end of namespace lbann - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_1PARAM(_SW_CH_,_T_,_FUNC_,_P1_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_); \ - case 2: return _FUNC_<_T_,2>(_P1_); \ - case 3: return _FUNC_<_T_,3>(_P1_); \ - case 4: return _FUNC_<_T_,4>(_P1_); \ - } - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_2PARAMS(_SW_CH_,_T_,_FUNC_,_P1_,_P2_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_,_P2_); \ - case 2: return _FUNC_<_T_,2>(_P1_,_P2_); \ - case 3: return _FUNC_<_T_,3>(_P1_,_P2_); \ - case 4: return _FUNC_<_T_,4>(_P1_,_P2_); \ - } - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_3PARAMS(_SW_CH_,_T_,_FUNC_,_P1_,_P2_,_P3_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_,_P2_,_P3_); \ - case 2: return _FUNC_<_T_,2>(_P1_,_P2_,_P3_); \ - case 3: return _FUNC_<_T_,3>(_P1_,_P2_,_P3_); \ - case 4: return _FUNC_<_T_,4>(_P1_,_P2_,_P3_); \ - } - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_4PARAMS(_SW_CH_,_T_,_FUNC_,_P1_,_P2_,_P3_,_P4_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_,_P2_,_P3_,_P4_); \ - case 2: return _FUNC_<_T_,2>(_P1_,_P2_,_P3_,_P4_); \ - case 3: return _FUNC_<_T_,3>(_P1_,_P2_,_P3_,_P4_); \ - case 4: return _FUNC_<_T_,4>(_P1_,_P2_,_P3_,_P4_); \ - } - -#define _SWITCH_CV_FUNC_1PARAM(_SW_D_,_FUNC_,_P1_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_); \ - } - -#define 
_SWITCH_CV_FUNC_2PARAMS(_SW_D_,_FUNC_,_P1_,_P2_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_,_P2_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_,_P2_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_,_P2_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_,_P2_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_,_P2_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_,_P2_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_,_P2_); \ - } - -#define _SWITCH_CV_FUNC_3PARAMS(_SW_D_,_FUNC_,_P1_,_P2_,_P3_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_,_P2_,_P3_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_,_P2_,_P3_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_,_P2_,_P3_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_,_P2_,_P3_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_,_P2_,_P3_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_,_P2_,_P3_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_,_P2_,_P3_); \ - } - -#define _SWITCH_CV_FUNC_4PARAMS(_SW_D_,_FUNC_,_P1_,_P2_,_P3_,_P4_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_,_P2_,_P3_,_P4_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_,_P2_,_P3_,_P4_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_,_P2_,_P3_,_P4_); \ - } - -#endif // _LBANN_OPENCV_EXTENSIONS_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/CMakeLists.txt b/include/lbann/data_readers/patchworks/CMakeLists.txt deleted file mode 100644 index d45491f93cd..00000000000 --- a/include/lbann/data_readers/patchworks/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -# Add the headers for this directory -set_full_path(THIS_DIR_HEADERS - patchworks.hpp - patchworks_ROI.hpp - patchworks_common.hpp - patchworks_patch_descriptor.hpp - patchworks_stats.hpp - ) - -# Propagate the files up the tree -set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/data_readers/patchworks/patchworks.hpp b/include/lbann/data_readers/patchworks/patchworks.hpp deleted file mode 100644 index d445bb2d343..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks.hpp +++ /dev/null @@ -1,59 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks.hpp - LBANN PATCHWORKS main interface header -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS main interface header - * - includes the main interface function declarations - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_H_INCLUDED_ -#define _PATCHWORKS_H_INCLUDED_ -#include -#include "patchworks_common.hpp" -#include "patchworks_patch_descriptor.hpp" - -namespace lbann { -namespace patchworks { - -/// Compute the min and max value of pixels -std::pair check_min_max(const cv::Mat& _img); - -/// Adjust for reducing chromatic aberration -cv::Mat correct_chromatic_aberration(const cv::Mat& _img); - -/// Drop 2 channels randomly -cv::Mat drop_2channels(const cv::Mat& _img); - -} // end of namespace patchworks -} // end of namespace lbann - -#endif //_PATCHWORKS_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/patchworks_ROI.hpp b/include/lbann/data_readers/patchworks/patchworks_ROI.hpp deleted file mode 100644 index 3abdfed5da6..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks_ROI.hpp +++ /dev/null @@ -1,153 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks_ROI.hpp - LBANN PATCHWORKS ROI (region-of-interest) header -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS ROI header - * - Region of interest descriptor - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_ROI_H_INCLUDED_ -#define _PATCHWORKS_ROI_H_INCLUDED_ - -#include -#include -#include "patchworks_common.hpp" - -namespace lbann { -namespace patchworks { - -/** - * Regions of interest descriptor. 
- * Contains a pair of coordinates that defines a rectangular region of interest - */ -class ROI { - public: - /// An internal value to represent an uninitialized coordinate value - static const int undefined_coordinate; - - int m_left; ///< The left-most pixel position of the region - int m_top; ///< The top-most pixel position of the region - int m_right; ///< The right-most pixel position of the region - int m_bottom; ///< The bottom-most pixel position of the region - - ROI() ///< The default constructor - : m_left(undefined_coordinate), m_top(undefined_coordinate), - m_right(undefined_coordinate), m_bottom(undefined_coordinate) {} - - void init(); ///< Reset the structure with undefined coordinate values - bool is_undefined() const; ///< Tell if the structure has not been initialized - bool is_valid() const; ///< Check if the region is valid - bool set_overlapping_region(const cv::Mat& img); - /// Check if the region of interest covers the whole image - bool is_whole_image(const cv::Mat& img); - - /// Set a region by the coordinates - bool set_by_corners(const int p0_x, const int p0_y, - const int p1_x, const int p1_y); - /// Set a region by the center and its size - bool set_by_center(const int px, const int py, - const unsigned int _width, const unsigned int _height); - - /// move the region horizontally by dx and vertically by dy - void move(const std::pair displacement); - - /// Returns the left position of the region - int left() const { - return m_left; - } - /// Returns the top poisition of the region - int top() const { - return m_top; - } - /// Returns the right position of the region - int right() const { - return m_right; - } - /// Returns the bottom position of the region - int bottom() const { - return m_bottom; - } - - /// Returns a cv::Rect equivalent - cv::Rect rect() const { - return cv::Rect(m_left, m_top, m_right-m_left, m_bottom-m_top); - } - /// Returns the width of the rectangular region - int width() const { - return (m_right - m_left); - } - /// Returns the height of the rectangular region - int height() const { - return (m_bottom - m_top); - } - /// Returns the area of the rectangular region - int area() const { - return width()*height(); - } - /// Returns the size of the area (width, hegiht) - - std::ostream& Print(std::ostream& os) const { ///< Print out the content - return os << '(' << m_left << ", " << m_top << ") (" - << m_right << ", " << m_bottom << ')'; - } - - /// Check if this ROI is exactly the same as the given rectangular area - bool operator==(const ROI& rarea) const; - /// Check if this ROI is not exactly the same as the given rectangular area - bool operator!=(const ROI& rarea) const; - /// Check if the given rectangular region contains this ROI but is not the same - bool operator<(const ROI& rarea) const; - /// Check if the given rectangular region contains this ROI - bool operator<=(const ROI& rarea) const; - /// Check if this ROI contains the given rectangular region but is not the same - bool operator>(const ROI& rarea) const; - /// Check if this ROI contains the given rectangular region - bool operator>=(const ROI& rarea) const; -}; - -inline bool ROI::operator<=(const ROI& rarea) const { - return (((rarea.m_left <= m_left) && (rarea.m_top <= m_top)) && - ((m_right <= rarea.m_right) && (m_bottom <= rarea.m_bottom)) && - is_valid()); -} - -inline bool ROI::operator>=(const ROI& rarea) const { - return (((m_left <= rarea.m_left) && (m_top <= rarea.m_top)) && - ((rarea.m_right <= m_right) && (rarea.m_bottom <= m_bottom)) && - 
rarea.is_valid()); -} - -std::ostream& operator<<(std::ostream& os, const ROI& roi); - -} // end of namespace patchworks -} // end of namespace lbann -#endif // _PATCHWORKS_ROI_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/patchworks_common.hpp b/include/lbann/data_readers/patchworks/patchworks_common.hpp deleted file mode 100644 index 5c3b9ceb7d1..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks_common.hpp +++ /dev/null @@ -1,70 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks_common.hpp - LBANN PATCHWORKS header for common definitions -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS common header - * - includes commonly used macros, definitions and declarations - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_COMMON_H_ -#define _PATCHWORKS_COMMON_H_ - -#include // std::pair -#include -#include -#include -#include "lbann/data_readers/opencv_extensions.hpp" - -namespace lbann { -namespace patchworks { - -/// Patch displacement type -using displacement_type = std::pair; - -#if 0 -// using 32-bit floating point for intermediate image data processing -using pw_fp_t = float; -using pw_cv_vec3 = cv::Vec3f; -#define _PATCHWORKS_STAT_FLOAT_ 32 -#define _PW_CV_FP_ CV_32FC3 -#else -// using 64-bit floating point for intermediate image data processing -using pw_fp_t = double; -using pw_cv_vec3 = cv::Vec3d; -#define _PATCHWORKS_STAT_FLOAT_ 64 -#define _PW_CV_FP_ CV_64FC3 -#endif - -} // end of namespace patchworks -} // end of namespace lbann - -#endif // _PATCHWORKS_COMMON_H_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp b/include/lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp deleted file mode 100644 index 2891055593c..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp +++ /dev/null @@ -1,186 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. 
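ROI::operator<= and operator>= above define containment over the corner coordinates. The same relation can be restated with cv::Rect, whose & operator returns the intersection: a rectangle is contained in another exactly when intersecting leaves it unchanged. A stand-alone check, not part of the removed header.

    // Sketch: the containment relation ROI::operator<= encodes, restated with
    // cv::Rect. Intersection (operator&) returns the inner rectangle unchanged
    // exactly when it is contained in the outer one.
    #include <opencv2/core.hpp>
    #include <cassert>

    void check_containment() {
      const cv::Rect outer(0, 0, 100, 100);   // x, y, width, height
      const cv::Rect inner(10, 10, 20, 20);
      assert((outer & inner) == inner);       // inner is contained in outer
      assert((outer & inner) != outer);       // outer is not contained in inner
    }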
For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks_patch_descriptor.hpp - LBANN PATCHWORKS header for patch descriptor -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS header for patch descriptor - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_PATCH_DESCRIPTOR_H_INCLUDED_ -#define _PATCHWORKS_PATCH_DESCRIPTOR_H_INCLUDED_ - -#include -#include -#include "patchworks_common.hpp" -#include "patchworks_ROI.hpp" - -namespace lbann { -namespace patchworks { - -class patch_descriptor { - public: - // --- configuration variables --- - unsigned int m_width; ///< patch width - unsigned int m_height; ///< patch height - unsigned int m_gap; ///< gap between patches - unsigned int m_jitter; ///< for patch position randomization - - /** patch centering mode - * 0: place the center patch anywhere within the image - * 1: place the center patch anywhere as long as it allows the space for all 8 neighboring patches - * other: place the center patch at the center of the image - */ - unsigned int m_mode_center; - - /** chromatic aberration correction mode - * 0: nothing - * 1: pixel transform px*B where a=[-1 2 -1] and B=I-a'a/(aa') - * 2: randomly replace two channels with white noise - */ - unsigned int m_mode_chrom; - - /// Whether patches are self-labeled - bool m_self_label; - - /// The file extension name (i.e., image type) - std::string m_ext; - - // --- post-configuration variables --- - ROI m_sample_area; ///< The area to sample patches from - /// The list of displacements used to generate consecutive patches - std::vector m_displacements; - - // --- state variables --- - ROI m_patch_center; ///< The center patch region - /// The actual patch positions - std::vector m_positions; - /// The index of displacement used to generate the current patch - unsigned int m_cur_patch_idx; - - public: - patch_descriptor() { - init(); ///< Default constructor - } - virtual ~patch_descriptor() {} - void init(); ///< Initializer - void reset(); ///< Clear state variables other than configuration variables - - /// Get patch size - unsigned int get_patch_width() const { return m_width; } - unsigned int get_patch_height() const { return m_height; } - - /// Set patch size - void set_size(const int w, const int h); - /// Set the gap between neighboring patches - void set_gap(const unsigned int g) { - m_gap = g; - } - /// Set poisiton radomization parameter, the maximum jitter - void set_jitter(const unsigned int j) { - m_jitter = j; - } - /// Set mode to place center patch - void set_mode_centering(const unsigned int m) { - m_mode_center = m; - } - /// Set correction mode for chromatic aberration - void set_mode_chromatic_aberration(const unsigned int m) { - m_mode_chrom = m; - } - - /// Declare the size of the image to take patches from, and implicitly set the area to sample as the entire image - bool 
set_sample_image(const unsigned int w, const unsigned int h); - /// Explicitly set the area to sample patches - bool set_sample_area(const ROI& area); - - /// Set the file extention of patch files - void set_file_ext(const std::string e) { - m_ext = e; - } - - /// Mark self labeling for patches - void set_self_label() { m_self_label = true; } - - /// Unmark self labeling - void unset_self_label() { m_self_label = false; } - - bool is_self_labeling() const { return m_self_label; } - - unsigned int get_num_labels() const { return 8u; } - - /// A function that populates the list of displacements from the base patch to the next one - virtual void define_patch_set(); - - /// transform each pixel by B = I - a'*a/(a*a') where a=[-1 2 -1] to mitigate chromatic aberration - bool is_to_correct_chromatic_aberration_at_pixel() const { - return (m_mode_chrom == 1); - } - - /// randomly drop two channels to avoid chromatic aberration impact - bool is_to_drop_2channels() const { - return (m_mode_chrom == 2); - } - - /// Allow read-only access to the patch displacements - const std::vector& get_displacements() const { - return m_displacements; - } - - virtual unsigned int get_num_patches() const { return 2u; } - - /// Compute the position of the first patch - virtual bool get_first_patch(ROI& patch); - /// Compute the position of a subsequent patch - virtual bool get_next_patch(ROI& patch); - /// extract all the patches defined - virtual bool extract_patches(const cv::Mat& img, std::vector& patches); - /** - * Return the label of the last patch generated. - * For dual patch scenarios, it is one less the id of the non-center patch position. - */ - virtual unsigned int get_last_label() const { return m_cur_patch_idx - 1; } - - /// Allow read-only access to the positions of the patches generated - const std::vector& access_positions() const { - return m_positions; - } - virtual std::string get_type() const { return "patch_descriptor"; } - virtual std::string get_description() const; - /// Print out the content of patch descriptor - virtual std::ostream& print(std::ostream& os) const; -}; - -/// stream out the patch descriptor content -std::ostream& operator<<(std::ostream& os, const patch_descriptor& pd); - -} // end of namespace patchworks -} // end of namespace lbann -#endif // _PATCHWORKS_PATCH_DESCRIPTOR_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/patchworks_stats.hpp b/include/lbann/data_readers/patchworks/patchworks_stats.hpp deleted file mode 100644 index 12141012eef..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks_stats.hpp +++ /dev/null @@ -1,93 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
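Chromatic-aberration mode 1 above applies the per-pixel transform px*B with a = [-1 2 -1] and B = I - a'a/(aa'). Since aa' = 6, B works out to I - (1/6)[[1,-2,1],[-2,4,-2],[1,-2,1]]. The sketch below builds B numerically with cv::Matx; it illustrates the formula and is not the removed patchworks implementation.

    // Sketch: building B = I - a'a/(aa') for a = [-1 2 -1], the projection used
    // by chromatic-aberration mode 1 described above. aa' = 6, so
    // B = I - (1/6) * [[1,-2,1],[-2,4,-2],[1,-2,1]].
    #include <opencv2/core.hpp>

    cv::Matx33d make_aberration_projection() {
      const cv::Matx13d a(-1.0, 2.0, -1.0);
      const double aaT = (a * a.t())(0, 0);            // scalar a*a' = 6
      const cv::Matx33d aTa = a.t() * a;               // 3x3 outer product a'*a
      return cv::Matx33d::eye() - aTa * (1.0 / aaT);   // B = I - a'a/(aa')
    }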
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks_stats.hpp - LBANN PATCHWORKS header for pixel statistics -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS header for pixel statistics - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_STATS_INCLUDED_ -#define _PATCHWORKS_STATS_INCLUDED_ - -#include -#include -#include "patchworks_common.hpp" - -namespace lbann { -namespace patchworks { - -/// Pixel statistics of an image -struct image_stats { - size_t cnt; ///< number of values (pixels) - size_t cntZeros; ///< number of zero values - pw_fp_t min; ///< minimum intensity of a pixel - pw_fp_t max; ///< maximum intensity of a pixel - pw_fp_t median; ///< median intensity of a pixel - pw_fp_t minNZ; ///< number of non-zero pixels - pw_fp_t medianNZ; ///< median among non-zero values - double avg; ///< average intensity - double avgNZ; ///< average intensity among non-zeros - double stdev; ///< standard deviation of intensity - double stdevNZ; ///< standard deviation among non-zero values - - /// Print out statistics - std::ostream& Print(std::ostream& os) const { - os << " stats:" << std::endl - << " - cnt : " << cnt << std::endl - << " - cnt0 : " << cntZeros << std::endl - << " - min : " << min << std::endl - << " - max : " << max << std::endl - << " - med : " << median << std::endl - << " - minNZ : " << minNZ << std::endl - << " - medNZ : " << medianNZ << std::endl - << " - avg : " << avg << std::endl - << " - avgNZ : " << avgNZ << std::endl - << " - std : " << stdev << std::endl - << " - stdNZ : " << stdevNZ << std::endl; - return os; - } -}; - -/// Stream out the image statistics -inline std::ostream& operator<<(std::ostream& os, const image_stats& stats) { - return stats.Print(os); -} - -/// Compute the pixel statistics for a mono channel image -bool get_single_channel_stats(const cv::Mat& img, image_stats& stats); - -/// Compute the pixel statistics of an image per channel -bool get_channel_stats(const cv::Mat& img, std::vector& stats); - - -} // end of namespace patchworks -} // end of namespace lbann -#endif // _PATCHWORKS_STATS_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/sample_list.hpp b/include/lbann/data_readers/sample_list.hpp new file mode 100644 index 00000000000..6d4aa5e051f --- /dev/null +++ b/include/lbann/data_readers/sample_list.hpp @@ -0,0 +1,160 @@ +#ifndef __SAMPLE_LIST_HPP__ +#define __SAMPLE_LIST_HPP__ + +#include +#include +#include +#include + +#include "lbann/comm.hpp" + +#include "lbann/utils/file_utils.hpp" +#include +#include +#include +#include +#include +#include + +namespace lbann { + +static const std::string sample_exclusion_list = "CONDUIT_HDF5_EXCLUSION"; +static const std::string sample_inclusion_list = "CONDUIT_HDF5_INCLUSION"; + +struct sample_list_header { + bool m_is_exclusive; + /// Number of included samples + size_t m_included_sample_count; + /// Number of excluded samples + size_t m_excluded_sample_count; + size_t m_num_files; + std::string m_file_dir; + std::string m_sample_list_filename; + + sample_list_header(); + + bool 
is_exclusive() const; + size_t get_sample_count() const; + size_t get_num_files() const; + const std::string& get_sample_list_filename() const; + const std::string& get_file_dir() const; + template void serialize( Archive & ar ) { + ar(m_is_exclusive, m_included_sample_count, m_excluded_sample_count, m_num_files, m_file_dir, m_sample_list_filename); + } +}; + +template +class sample_list { + public: + /// The type for the index assigned to each sample file + using sample_file_id_t = std::size_t; + /** To describe a sample as the id of the file to which it belongs. + * Each file contains only one sample. */ + using sample_t = std::template pair; + /// Type for the list of samples + using samples_t = std::template vector< sample_t >; + /// Mapping of the file index to the filename + using file_id_stats_v_t = std::vector< std::string >; + + sample_list(); + virtual ~sample_list(); + sample_list(const sample_list& rhs); + sample_list& operator=(const sample_list& rhs); + sample_list& copy(const sample_list& rhs); + + void copy_members(const sample_list& rhs); + + /// Load a sample list file + void load(const std::string& samplelist_file, size_t stride=1, size_t offset=0); + + /// Load the header of a sample list file + sample_list_header load_header(const std::string& samplelist_file) const; + + /// Restore a sample list from a serialized string + void load_from_string(const std::string& samplelist); + + /// Tells how many samples in the list + virtual size_t size() const; + + /// Tells how many sample files are there + virtual size_t get_num_files() const; + + /// Tells if the internal list is empty + bool empty() const; + + /// Serialize to and from an archive using the cereal library + template void serialize( Archive & ar ); + + /// Serialize sample list + virtual bool to_string(std::string& sstr) const; + + /// Write the sample list + void write(const std::string filename) const; + + /// Allow read-only access to the internal list data + const samples_t& get_list() const; + + /// Allow the read-only access to the list header + const sample_list_header& get_header() const; + + /// Allow read-only access to the metadata of the idx-th sample in the list + const sample_t& operator[](size_t idx) const; + + virtual const std::string& get_samples_filename(sample_file_id_t id) const; + + const std::string& get_samples_dirname() const; + + void all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); + void all_gather_archive_new(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); + + template size_t all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm); + virtual void all_gather_packed_lists(lbann_comm& comm); + + protected: + + /// Reads a header line from the sample list given as a stream, and use the info string for error message + std::string read_header_line(std::istream& ifs, const std::string& filename, const std::string& info) const; + + /// Reads the header of a sample list + sample_list_header read_header(std::istream& istrm, const std::string& filename) const; + + /// read the body of a sample list, which is the list of sample files, where each file contains a single sample. + virtual void read_sample_list(std::istream& istrm, size_t stride=1, size_t offset=0); + + /// Assign names to samples when there is only one sample per file without a name. 
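The sample_list<sample_name_t> template above exposes load(), size(), get_header(), get_list(), and get_samples_filename(). Below is a minimal single-rank usage sketch with string-valued sample names; the list file name is hypothetical, and the on-disk format is whatever read_header()/read_sample_list() parse, which is not reproduced here.

    // Sketch: driving the new sample_list API with string-valued sample names.
    // The file name is a placeholder; the list format is parsed by
    // read_header()/read_sample_list() and is not shown here.
    #include "lbann/data_readers/sample_list.hpp"
    #include <iostream>
    #include <string>

    void dump_sample_list(const std::string& list_file) {
      lbann::sample_list<std::string> sl;
      sl.load(list_file);                          // defaults: stride = 1, offset = 0
      std::cout << sl.get_header().get_file_dir() << " holds "
                << sl.size() << " samples\n";
      for (const auto& s : sl.get_list()) {        // s is pair<file id, sample name>
        std::cout << sl.get_samples_filename(s.first) << " : " << s.second << "\n";
      }
    }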
+ virtual void assign_samples_name(); + + /// Reads a sample list and populates the internal list + size_t get_samples_per_file(std::istream& istrm, const std::string& filename, size_t stride=1, size_t offset=0); + + /// Add the header info to the given string + void write_header(std::string& sstr, size_t num_files) const; + + /// Get the number of total/included/excluded samples + virtual void get_num_samples(size_t& total, size_t& included, size_t& excluded) const; + + virtual void set_samples_filename(sample_file_id_t id, const std::string& filename); + + protected: + /// header info of sample list + sample_list_header m_header; + + private: + /// List of all samples with a file identifier and sample name for each sample + samples_t m_sample_list; + + /// Maps sample's file id to file names, file descriptors, and use counts + file_id_stats_v_t m_file_id_stats_map; + +}; + +void handle_mpi_error(int ierr); + +template +inline T uninitialized_sample_name(); + +} // end of namespace + +#include "sample_list_impl.hpp" + +#endif // __SAMPLE_LIST_HPP__ diff --git a/include/lbann/data_readers/sample_list_conduit_io_handle.hpp b/include/lbann/data_readers/sample_list_conduit_io_handle.hpp new file mode 100644 index 00000000000..ff9b59ed7f5 --- /dev/null +++ b/include/lbann/data_readers/sample_list_conduit_io_handle.hpp @@ -0,0 +1,95 @@ +#ifndef __SAMPLE_LIST_CONDUIT_IO_HANDLE_HPP__ +#define __SAMPLE_LIST_CONDUIT_IO_HANDLE_HPP__ + +#include "sample_list_open_files.hpp" +#include "conduit/conduit.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_io_handle.hpp" + +namespace lbann { + +template +class sample_list_conduit_io_handle : public sample_list_open_files { + public: + using file_handle_t = conduit::relay::io::IOHandle*; + using typename sample_list_open_files::sample_file_id_t; + using typename sample_list_open_files::sample_t; + using typename sample_list_open_files::samples_t; + using typename sample_list_open_files::file_id_stats_t; + using typename sample_list_open_files::file_id_stats_v_t; + using typename sample_list_open_files::fd_use_map_t; + + sample_list_conduit_io_handle(); + ~sample_list_conduit_io_handle() override; + + bool is_file_handle_valid(const file_handle_t& h) const override; + + protected: + void obtain_sample_names(file_handle_t& h, std::vector& sample_names) const override; + file_handle_t open_file_handle_for_read(const std::string& path) override; + void close_file_handle(file_handle_t& h) override; + void clear_file_handle(file_handle_t& h) override; +}; + + +template +inline sample_list_conduit_io_handle::sample_list_conduit_io_handle() +: sample_list_open_files() {} + +template +inline sample_list_conduit_io_handle::~sample_list_conduit_io_handle() { + // Close the existing open files + for(auto& f : this->m_file_id_stats_map) { + file_handle_t& h = std::get<1>(f); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(f).clear(); + } + this->m_file_id_stats_map.clear(); +} + +template +inline void sample_list_conduit_io_handle +::obtain_sample_names(sample_list_conduit_io_handle::file_handle_t& h, std::vector& sample_names) const { + sample_names.clear(); + if (h != nullptr) { + h->list_child_names("/", sample_names); + } +} + +template +inline bool sample_list_conduit_io_handle +::is_file_handle_valid(const sample_list_conduit_io_handle::file_handle_t& h) const { + return ((h != nullptr) && (h->is_open())); +} + +template +inline typename sample_list_conduit_io_handle::file_handle_t sample_list_conduit_io_handle 
+::open_file_handle_for_read(const std::string& file_path) { + file_handle_t h = new conduit::relay::io::IOHandle; + h->open(file_path, "hdf5"); + return h; +} + +template +inline void sample_list_conduit_io_handle +::close_file_handle(file_handle_t& h) { + if(is_file_handle_valid(h)) { + h->close(); + } +} + +template <> +inline conduit::relay::io::IOHandle* uninitialized_file_handle() { + return nullptr; +} + +template +inline void sample_list_conduit_io_handle +::clear_file_handle(sample_list_conduit_io_handle::file_handle_t& h) { + h = uninitialized_file_handle(); +} + +} // end of namespace lbann + +#endif // __SAMPLE_LIST_CONDUIT_IO_HANDLE_HPP__ diff --git a/include/lbann/data_readers/sample_list_hdf5.hpp b/include/lbann/data_readers/sample_list_hdf5.hpp new file mode 100644 index 00000000000..f9181594076 --- /dev/null +++ b/include/lbann/data_readers/sample_list_hdf5.hpp @@ -0,0 +1,91 @@ +#ifndef __SAMPLE_LIST_HDF5_HPP__ +#define __SAMPLE_LIST_HDF5_HPP__ + +#include "sample_list_open_files.hpp" +#include "hdf5.h" +#include "conduit/conduit.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_io_hdf5.hpp" + +namespace lbann { + +template +class sample_list_hdf5 : public sample_list_open_files { + public: + using file_handle_t = hid_t; + using typename sample_list_open_files::sample_file_id_t; + using typename sample_list_open_files::sample_t; + using typename sample_list_open_files::samples_t; + using typename sample_list_open_files::file_id_stats_t; + using typename sample_list_open_files::file_id_stats_v_t; + using typename sample_list_open_files::fd_use_map_t; + + sample_list_hdf5(); + ~sample_list_hdf5() override; + + bool is_file_handle_valid(const hid_t& h) const override; + + protected: + void obtain_sample_names(hid_t& h, std::vector& sample_names) const override; + hid_t open_file_handle_for_read(const std::string& path) override; + void close_file_handle(hid_t& h) override; + void clear_file_handle(hid_t& h) override; +}; + + +template +inline sample_list_hdf5::sample_list_hdf5() +: sample_list_open_files() {} + +template +inline sample_list_hdf5::~sample_list_hdf5() { + // Close the existing open files + for(auto& f : this->m_file_id_stats_map) { + file_handle_t& h = std::get<1>(f); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(f).clear(); + } + this->m_file_id_stats_map.clear(); +} + +template +inline void sample_list_hdf5 +::obtain_sample_names(hid_t& h, std::vector& sample_names) const { + conduit::relay::io::hdf5_group_list_child_names(h, "/", sample_names); +} + +template +inline bool sample_list_hdf5 +::is_file_handle_valid(const hid_t& h) const { + return (h > static_cast(0)); +} + +template +inline hid_t sample_list_hdf5< sample_name_t> +::open_file_handle_for_read(const std::string& file_path) { + return conduit::relay::io::hdf5_open_file_for_read(file_path); +} + +template +inline void sample_list_hdf5 +::close_file_handle(hid_t& h) { + if(is_file_handle_valid(h)) { + conduit::relay::io::hdf5_close_file(h); + } +} + +template <> +inline hid_t uninitialized_file_handle() { + return static_cast(0); +} + +template +inline void sample_list_hdf5 +::clear_file_handle(hid_t& h) { + h = uninitialized_file_handle(); +} + +} // end of namespace lbann + +#endif // __SAMPLE_LIST_HDF5_HPP__ diff --git a/include/lbann/data_readers/sample_list_impl.hpp b/include/lbann/data_readers/sample_list_impl.hpp new file mode 100644 index 00000000000..0f161bed61f --- /dev/null +++ b/include/lbann/data_readers/sample_list_impl.hpp @@ -0,0 +1,747 @@ 
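sample_list_conduit_io_handle above manages one conduit::relay::io::IOHandle per sample file: open with the "hdf5" protocol, list the top-level child names to enumerate samples, and close when done. The stand-alone sketch below walks that same lifecycle with the same Conduit calls; the file path is a placeholder.

    // Sketch: the handle lifecycle sample_list_conduit_io_handle wraps: open an
    // HDF5 file through conduit::relay::io::IOHandle, list the top-level sample
    // names, close. The path argument is a placeholder.
    #include "conduit/conduit_relay_io_handle.hpp"
    #include <iostream>
    #include <string>
    #include <vector>

    void list_samples(const std::string& path) {
      conduit::relay::io::IOHandle handle;
      handle.open(path, "hdf5");                   // protocol string as used above
      std::vector<std::string> names;
      handle.list_child_names("/", names);         // one entry per top-level sample
      for (const auto& n : names) { std::cout << n << "\n"; }
      handle.close();
    }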
+#include +#include +#include +#include +#include +#include +#include +#include +#include "lbann/utils/exception.hpp" +#include "lbann/utils/file_utils.hpp" +#include +#include +#include +#include +#include + +#include +#include + +namespace lbann { + +template +inline std::string to_string(const T val) { + return std::to_string(val); +} + +template<> +inline std::string to_string(const std::string val) { + return val; +} + +template +inline auto to_sample_name_t(const std::string& sn_str) -> decltype (sample_name_t()){ + LBANN_ERROR(std::string{} + " :: string conversion is not implement for the sample_name_t"); + return sample_name_t(); +} + +template<> inline int to_sample_name_t(const std::string& sn_str) { + return std::stoi(sn_str); +} + +template<> inline long to_sample_name_t(const std::string& sn_str) { + return std::stol(sn_str); +} + +template<> inline unsigned long to_sample_name_t(const std::string& sn_str) { + return std::stoul(sn_str); +} + +template<> inline long long to_sample_name_t(const std::string& sn_str) { + return std::stoll(sn_str); +} + +template<> inline unsigned long long to_sample_name_t(const std::string& sn_str) { + return std::stoull(sn_str); +} + +template<> inline float to_sample_name_t(const std::string& sn_str) { + return std::stof(sn_str); +} + +template<> inline double to_sample_name_t(const std::string& sn_str) { + return std::stod(sn_str); +} + +template<> inline long double to_sample_name_t(const std::string& sn_str) { + return std::stold(sn_str); +} + +template<> inline std::string to_sample_name_t(const std::string& sn_str) { + return sn_str; +} + +//------------------------ +// sample_list_header +//------------------------ + +inline sample_list_header::sample_list_header() + : m_is_exclusive(false), m_included_sample_count(0u), + m_excluded_sample_count(0u), m_num_files(0u), + m_file_dir("") { +} + +inline bool sample_list_header::is_exclusive() const { + return m_is_exclusive; +} + +inline size_t sample_list_header::get_sample_count() const { + return m_included_sample_count; +} + +inline size_t sample_list_header::get_num_files() const { + return m_num_files; +} + +inline const std::string& sample_list_header::get_sample_list_filename() const { + return m_sample_list_filename; +} + +inline const std::string& sample_list_header::get_file_dir() const { + return m_file_dir; +} + +//------------------ +// sample_list +//------------------ + +template +inline sample_list::sample_list() { +} + +template +inline sample_list::~sample_list() { +} + +template +inline sample_list +::sample_list(const sample_list& rhs) { + copy_members(rhs); +} + +template +inline sample_list& sample_list +::operator=(const sample_list& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + + copy_members(rhs); + + return (*this); +} + +template +inline sample_list& sample_list +::copy(const sample_list& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + + copy_members(rhs); + + return (*this); +} + +template +inline void sample_list +::copy_members(const sample_list& rhs) { + m_header = rhs.m_header; + m_sample_list = rhs.m_sample_list; + + /// Keep track of existing filenames + m_file_id_stats_map = rhs.m_file_id_stats_map; +} + +template +inline void sample_list +::load(const std::string& samplelist_file, + size_t stride, size_t offset) { + std::ifstream istr(samplelist_file); + get_samples_per_file(istr, samplelist_file, stride, offset); + istr.close(); +} + +template +inline sample_list_header sample_list 
+::load_header(const std::string& samplelist_file) const { + std::ifstream istr(samplelist_file); + return read_header(istr, samplelist_file); +} + +template +inline void sample_list +::load_from_string(const std::string& samplelist) { + std::istringstream istr(samplelist); + get_samples_per_file(istr, "", 1, 0); +} + +template +inline size_t sample_list +::size() const { + return m_sample_list.size(); +} + +template +inline size_t sample_list +::get_num_files() const { + return m_file_id_stats_map.size(); +} + +template +inline bool sample_list +::empty() const { + return (size() == 0ul); +} + +template +inline std::string sample_list +::read_header_line(std::istream& istrm, + const std::string& filename, + const std::string& info) const { + if (!istrm.good()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: unable to read the header line of sample list " + filename + " for " + info); + } + + std::string line; + std::getline(istrm, line); + + if (line.empty()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: unable to read the header line of sample list " + filename + " for " + info + + " -- the line was empty"); + } + return line; +} + + +template +inline sample_list_header sample_list +::read_header(std::istream& istrm, + const std::string& filename) const { + sample_list_header hdr; + + hdr.m_sample_list_filename = filename; + + std::string line1 = read_header_line(istrm, filename, "the exclusiveness"); + std::stringstream header1(line1); + + std::string line2 = read_header_line(istrm, filename, "the number of samples and the number of files"); + std::stringstream header2(line2); + + std::string line3 = read_header_line(istrm, filename, "the data file directory"); + std::stringstream header3(line3); + + std::string sample_list_type; + header1 >> sample_list_type; + std::for_each(sample_list_type.begin(), sample_list_type.end(), [](char& c){ c = std::toupper(c); }); + + const std::string type_exclusive = sample_exclusion_list; + size_t found = sample_list_type.find(type_exclusive); + + if (found != std::string::npos) { + hdr.m_is_exclusive = true; + } else { + hdr.m_is_exclusive = false; + } + + header2 >> hdr.m_included_sample_count; + header2 >> hdr.m_excluded_sample_count; + header2 >> hdr.m_num_files; + + header3 >> hdr.m_file_dir; + + if (hdr.get_file_dir().empty() || !check_if_dir_exists(hdr.get_file_dir())) { + LBANN_ERROR(std::string{} + "file " + filename + + " :: data root directory '" + hdr.get_file_dir() + "' does not exist."); + } + + return hdr; +} + + +template +inline void sample_list +::read_sample_list(std::istream& istrm, + size_t stride, size_t offset) { + m_sample_list.reserve(m_header.get_sample_count()); + + const std::string whitespaces(" \t\f\v\n\r"); + size_t cnt_files = 0u; + std::string line; + + while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + if (cnt_files++ >= m_header.get_num_files()) { + break; + } + // Check to see if there is a strided load and skip the lines that are not for this rank + if ((cnt_files-1)%stride != offset) { + continue; + } + + std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing + std::string filename; + + sstr >> filename; + + const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; + + if (filename.empty() || !check_if_file_exists(file_path)) { + throw 
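read_header above expects a fixed three-line preamble: the inclusion/exclusion keyword, the included and excluded sample counts plus the number of files, and the data root directory. The base-class body that follows then lists one data file per line, each contributing a single sample. A small hypothetical list in that layout (the legacy keyword is shown; the new reader matches whatever string the sample_inclusion_list constant holds):

CONDUIT_HDF5_INCLUSION
2 0 2
/p/some/dataset/dir
bundle_000.h5
bundle_001.h5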
lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: data file '" + filename + "' does not exist."); + } + + const sample_file_id_t index = m_file_id_stats_map.size(); + static const auto sn0 = uninitialized_sample_name(); + m_sample_list.emplace_back(std::make_pair(index, sn0)); + m_file_id_stats_map.emplace_back(filename); + } + + if (m_header.get_num_files() != cnt_files) { + LBANN_ERROR(std::string("Sample list number of files requested ") + + std::to_string(m_header.get_num_files()) + + std::string(" does not equal number of files loaded ") + + std::to_string(cnt_files)); + } + + if(stride == 1 && m_header.get_sample_count() != m_sample_list.size()) { + LBANN_ERROR(std::string("Sample list count ") + + std::to_string(m_header.get_sample_count()) + + std::string(" does not equal sample list size ") + + std::to_string(m_sample_list.size())); + } +} + + +template +inline size_t sample_list +::get_samples_per_file(std::istream& istrm, + const std::string& filename, + size_t stride, size_t offset) { + m_header = read_header(istrm, filename); + + read_sample_list(istrm, stride, offset); + + return size(); +} + + +template +inline void sample_list +::all_gather_archive(const std::string &archive, + std::vector& gathered_archive, + lbann_comm& comm) { + if (!options::get()->get_bool("all_gather_old")) { + all_gather_archive_new(archive, gathered_archive, comm); + return; + } + + int size_of_list_archive = archive.size(); + std::vector packed_sizes(comm.get_procs_per_trainer()); + + comm.trainer_all_gather(size_of_list_archive, packed_sizes); + + int total_packed_size = 0; + std::vector displ; + displ.assign(comm.get_procs_per_trainer()+1, 0); + + for (size_t i = 0u; i < packed_sizes.size(); ++i) { + const auto sz = packed_sizes[i]; + displ[i+1] = displ[i] + sz; + } + total_packed_size = displ.back(); + + if (total_packed_size <= 0) { + return; + } + + std::string all_samples; + all_samples.resize(static_cast(total_packed_size)); + + std::vector local_data(archive.begin(), archive.end()); + std::vector packed_data(all_samples.size() * sizeof(decltype(all_samples)::value_type)); + comm.trainer_all_gather(local_data, + packed_data, + packed_sizes, + displ); + + for (size_t i = 0u; i < packed_sizes.size(); ++i) { + std::string& buf = gathered_archive[i]; + const auto sz = packed_sizes[i]; + displ[i+1] = displ[i] + sz; + std::vector::const_iterator first = packed_data.begin() + displ[i]; + std::vector::const_iterator last = packed_data.begin() + displ[i] + sz; + buf.resize(sz); + buf.assign(first, last); + } + return; +} + +template +inline void sample_list +::all_gather_archive_new(const std::string &archive, + std::vector& gathered_archive, + lbann_comm& comm) { + + // there's commented out code below to deal with the case where + // archive.size() > INT_MAX; but for now let's assume we won't + // encounter that (which is true for the 100M JAG set) + int constexpr max_int = std::numeric_limits::max(); + size_t n = archive.size(); + if (n > max_int) { + LBANN_ERROR("(n > max_int"); + } + + // change int to size_t for case where n > max_int (see commented out + // code block below) + int size_of_my_archive= archive.size(); + std::vector packed_sizes(comm.get_procs_per_trainer()); + comm.trainer_all_gather(size_of_my_archive, packed_sizes); + + int me = comm.get_rank_in_trainer(); + int np = comm.get_procs_per_trainer(); + + size_t g = 0; + for (auto t : packed_sizes) { + g += t; + } + if (!me) { + std::cout << "global archive size: " << g << std::endl; + } + + for 
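all_gather_archive above exchanges per-rank archive sizes first and then uses a displacement table to carve the flat gathered buffer back into one string per rank. A self-contained sketch of that bookkeeping, with no MPI and hypothetical helper names:

#include <cassert>
#include <string>
#include <vector>

// Given the bytes each rank contributed and the flat gathered buffer,
// rebuild one string per rank using prefix-sum displacements.
std::vector<std::string> split_gathered(const std::vector<char>& packed_data,
                                        const std::vector<int>& packed_sizes) {
  std::vector<int> displ(packed_sizes.size() + 1, 0);
  for (size_t i = 0; i < packed_sizes.size(); ++i) {
    displ[i + 1] = displ[i] + packed_sizes[i];  // prefix sums of the sizes
  }
  assert(static_cast<size_t>(displ.back()) == packed_data.size());

  std::vector<std::string> per_rank(packed_sizes.size());
  for (size_t i = 0; i < packed_sizes.size(); ++i) {
    per_rank[i].assign(packed_data.begin() + displ[i],
                       packed_data.begin() + displ[i + 1]);
  }
  return per_rank;
}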
(int p=0; p(gathered_archive[p].data()); + comm.trainer_broadcast(p, data, sz); + } + +#if 0 + std::vector rounds; + for (int p=0; p(archive.data() + offset); + comm.trainer_broadcast(p, data, rounds[k]); + } else { + char *data = const_cast(gathered_archive[p].data() + offset); + comm.trainer_broadcast(p, data, rounds[k]); + } + offset += rounds[k]; +if (me == p) { +std::cout << "XX finished round" << std::endl; +} + } + } +#endif + + return; +} + +template +template +inline size_t sample_list +::all_gather_field(T data, + std::vector& gathered_data, + lbann_comm& comm) { + std::string archive; + std::ostringstream ss; + { + cereal::BinaryOutputArchive oarchive(ss); + oarchive(data); + } // archive goes out of scope, ensuring all contents are flushed + archive = ss.str(); + + std::vector gathered_archive(comm.get_procs_per_trainer()); + + all_gather_archive(archive, gathered_archive, comm); + + std::vector per_rank_data(comm.get_procs_per_trainer()); + + size_t gathered_field_size = 0; + for (size_t i = 0u; i < gathered_archive.size(); ++i) { + std::string& buf = gathered_archive[i]; + T& tmp = gathered_data[i]; + + std::stringstream in_ss(buf); + cereal::BinaryInputArchive iarchive(in_ss); + iarchive(tmp); + gathered_field_size += tmp.size(); + } + return gathered_field_size; +} + +template +template +void sample_list +::serialize( Archive & ar ) { + ar(m_header, m_sample_list, m_file_id_stats_map); +} + +template +inline void sample_list +::write_header(std::string& sstr, size_t num_files) const { + // The first line indicate if the list is exclusive or inclusive + // The next line contains the number of samples (included and excluded), + // as well as the number of files, which are the same in this caes + // The next line contains the root data file directory + + sstr += (m_header.is_exclusive()? sample_exclusion_list + "\n" : sample_inclusion_list + "\n"); + size_t total, included, excluded; + get_num_samples(total, included, excluded); + /// TODO: clarify the comment below + /// Include the number of invalid samples, which for an inclusive index list is always 0 + sstr += std::to_string(included) + ' ' + std::to_string(excluded) + ' ' + std::to_string(num_files) + '\n'; + sstr += m_header.get_file_dir() + '\n'; +} + +template +inline void sample_list +::get_num_samples(size_t& total, size_t& included, size_t& excluded) const { + total = size(); + included = size(); + excluded = 0ul; +} + +template +inline bool sample_list +::to_string(std::string& sstr) const { + size_t total_len = 0ul; + for (const auto& s : m_sample_list) { + const std::string& filename = m_file_id_stats_map[s.first]; + total_len += filename.size() + 1u; + } + + sstr.clear(); + + // reserve the string to hold the entire sample lit + size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1 + total_len + 1000; + sstr.reserve(estimated_len); + + // write the list header + write_header(sstr, get_num_files()); + + // write the list body + for (const auto& s : m_sample_list) { + // File name + const std::string& filename = m_file_id_stats_map[s.first]; + sstr += filename + '\n'; + } + + return true; +} + +template +inline void sample_list +::write(const std::string filename) const { + std::string dir, basename; + parse_path(filename, dir, basename); + if (!dir.empty() && !check_if_dir_exists(dir)) { + // The creation of a shared directory must be done once in a coordinated fashion + // among the entities that have access to it. 
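all_gather_field above packs each rank's container with cereal before the exchange and unpacks the gathered buffers afterwards. A minimal round-trip sketch of that pattern, assuming cereal is on the include path (the pack/unpack helper names are mine):

#include <cereal/archives/binary.hpp>
#include <cereal/types/string.hpp>
#include <cereal/types/vector.hpp>
#include <sstream>
#include <string>
#include <vector>

// Serialize a container into a binary string.
std::string pack(const std::vector<std::string>& v) {
  std::ostringstream ss;
  {
    cereal::BinaryOutputArchive oar(ss);
    oar(v);
  }  // the archive must go out of scope before the buffer is complete
  return ss.str();
}

// Restore the container from the binary string.
std::vector<std::string> unpack(const std::string& buf) {
  std::istringstream ss(buf);
  cereal::BinaryInputArchive iar(ss);
  std::vector<std::string> v;
  iar(v);
  return v;
}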
Thus, it must be done in advance + std::cerr << "The sample list output directory (" + dir + ") does not exist" << std::endl; + return; + } + + std::fstream ofs(filename, std::fstream::out | std::fstream::binary); + + if (!ofs.good()) { + return; + } + + std::string buf; + to_string(buf); + + ofs.write(buf.data(), buf.size()*sizeof(std::string::value_type)); + ofs.close(); +} + +template +inline const typename sample_list::samples_t& +sample_list::get_list() const { + return m_sample_list; +} + +template +inline const sample_list_header& +sample_list::get_header() const { + return m_header; +} + +template +inline const typename sample_list::sample_t& +sample_list::operator[](size_t idx) const { + return m_sample_list[idx]; +} + +template +inline const std::string& sample_list +::get_samples_filename(sample_file_id_t id) const { + return m_file_id_stats_map[id]; +} + +template +inline const std::string& sample_list +::get_samples_dirname() const { + return m_header.get_file_dir(); +} + +template +inline void sample_list +::set_samples_filename(sample_file_id_t id, const std::string& filename) { + m_file_id_stats_map[id] = filename; +} + +#if defined(__cpp_if_constexpr) // c++17 +template +inline void sample_list +::assign_samples_name() { + if constexpr (std::is_integral::value + && !std::is_same::value) { + sample_name_t i = 0; + for (auto& s: m_sample_list) { + s.second = i++; + } + } else if constexpr (std::is_same::value) { + for (auto& s: m_sample_list) { + s.second = s.first; + } + } else { + LBANN_ERROR(std::string{} + " :: base class does not implement this method" + + " for the current sample name type"); + } +} + +template +inline sample_name_t uninitialized_sample_name() { + if constexpr (std::is_integral::value) { + return static_cast(0); + } else if constexpr (std::is_same::value) { + return ""; + } else if constexpr (std::is_floating_point::value) { + return 0.0; + } else if constexpr (std::is_default_constructible::value + && std::is_copy_constructible::value) { + sample_name_t ret{}; + return ret; + } else { + LBANN_ERROR(std::string{} + " :: base class does not implement this method" + + " for the current sample name type"); + } +} +#else +template<> inline void sample_list +::assign_samples_name() { + size_t i = 0ul; + for (auto& s: m_sample_list) { + s.second = i++; + } +} + +template<> inline void sample_list +::assign_samples_name() { + for (auto& s: m_sample_list) { + s.second = s.first; + } +} + +template +inline void sample_list +::assign_samples_name() { + LBANN_ERROR(std::string{} + " :: base class does not implement this method" + + " for the current sample name type"); +} + +template<> inline size_t uninitialized_sample_name() { + return 0ul; +} + +template<> inline std::string uninitialized_sample_name() { + return ""; +} + +template +inline sample_name_t uninitialized_sample_name() { + sample_name_t ret{}; + return ret; +} +#endif // defined(__cpp_if_constexpr) + +template +inline void sample_list +::all_gather_packed_lists(lbann_comm& comm) { + int num_ranks = comm.get_procs_per_trainer(); + typename std::vector per_rank_samples(num_ranks); + typename std::vector> per_rank_files(num_ranks); + + size_t num_samples = all_gather_field(m_sample_list, per_rank_samples, comm); + size_t num_ids = all_gather_field(m_file_id_stats_map, per_rank_files, comm); + + m_sample_list.clear(); + m_file_id_stats_map.clear(); + + m_sample_list.reserve(num_samples); + m_file_id_stats_map.reserve(num_ids); + + for(int r = 0; r < num_ranks; r++) { + const samples_t& s_list = 
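assign_samples_name and uninitialized_sample_name above rely on if constexpr so that only the branch matching the sample-name type is instantiated, with explicit specializations as the pre-C++17 fallback. A tiny standalone analogue of that dispatch (default_name is a hypothetical stand-in, not part of the patch):

#include <iostream>
#include <string>
#include <type_traits>

// Pick a starting value per name type at compile time; unused branches
// are never instantiated under C++17.
template <typename name_t>
name_t default_name() {
  if constexpr (std::is_integral<name_t>::value) {
    return static_cast<name_t>(0);   // numeric names start at zero
  } else if constexpr (std::is_same<name_t, std::string>::value) {
    return "";                       // string names start empty
  } else {
    return name_t{};                 // anything default-constructible
  }
}

int main() {
  std::cout << default_name<size_t>() << " '"
            << default_name<std::string>() << "'\n";
  return 0;
}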
per_rank_samples[r]; + const auto& files = per_rank_files[r]; + for (const auto& s : s_list) { + sample_file_id_t index = s.first; + const std::string& filename = files[index]; + if(index >= m_file_id_stats_map.size() + || (m_file_id_stats_map.back() != filename)) { + index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(filename); + }else { + for(size_t i = 0; i < m_file_id_stats_map.size(); i++) { + if(filename == m_file_id_stats_map[i]) { + index = i; + break; + } + } + } + static const auto sn0 = uninitialized_sample_name(); + m_sample_list.emplace_back(std::make_pair(index, sn0)); + } + } + + assign_samples_name(); + + return; +} + +} // end of namespace lbann diff --git a/include/lbann/data_readers/sample_list_jag.hpp b/include/lbann/data_readers/sample_list_jag.hpp deleted file mode 100644 index 07040a80d48..00000000000 --- a/include/lbann/data_readers/sample_list_jag.hpp +++ /dev/null @@ -1,321 +0,0 @@ -#ifndef __SAMPLE_LIST_JAG_HPP__ -#define __SAMPLE_LIST_JAG_HPP__ - -#include -#include -#include -#include - -#ifndef _JAG_OFFLINE_TOOL_MODE_ -#include "lbann/comm.hpp" -#else -#include -#endif - -#include "lbann/utils/file_utils.hpp" -#include -#include -#include -#include -#include -#include -#include "conduit/conduit_relay_io_hdf5.hpp" - -/// Number of system and other files that may be open during execution -#define LBANN_MAX_OPEN_FILE_MARGIN 128 -#define LBANN_MAX_OPEN_FILE_RETRY 3 - -namespace lbann { - -struct sample_list_header { - bool m_is_exclusive; - /// Number of included samples - size_t m_included_sample_count; - /// Number of excluded samples - size_t m_excluded_sample_count; - size_t m_num_files; - std::string m_file_dir; - std::string m_sample_list_filename; - - sample_list_header(); - - bool is_exclusive() const; - size_t get_sample_count() const; - size_t get_num_files() const; - const std::string& get_sample_list_filename() const; - const std::string& get_file_dir() const; - template void serialize( Archive & ar ) { - ar(m_is_exclusive, m_included_sample_count, m_excluded_sample_count, m_num_files, m_file_dir, m_sample_list_filename); - } -}; - -static const std::string conduit_hdf5_exclusion_list = "CONDUIT_HDF5_EXCLUSION"; -static const std::string conduit_hdf5_inclusion_list = "CONDUIT_HDF5_INCLUSION"; - -class sample_list_jag { - public: - /// The type of the native identifier of a sample rather than an arbitrarily assigned index - using sample_name_t = std::string; - /// The type for arbitrarily assigned index - using sample_file_id_t = std::size_t; - /// To describe a sample as a pair of the file to which it belongs and its name - // using sample_t = std::pair; - using sample_t = std::pair; - /// Statistics for each file used by the sample list: includes the file name, file descriptor, and - /// and a queue of each step and substep when data will be loaded from the file - using file_id_stats_t = std::tuple>>; - - /// Type for the list of samples - using samples_t = std::vector< sample_t >; - /// Mapping of the file index to the statistics for each file - using file_id_stats_v_t = std::vector< file_id_stats_t >; // rename to sample_to_file_v or something - /// Type for the map of file descriptors to usage step and substep - using fd_use_map_t = std::pair>; - - sample_list_jag(); - ~sample_list_jag(); - sample_list_jag(const sample_list_jag& rhs); - sample_list_jag& operator=(const sample_list_jag& rhs); - sample_list_jag& copy(const sample_list_jag& rhs); - - void copy_members(const sample_list_jag& rhs); - - /// Load a sample list file - 
void load(const std::string& samplelist_file, size_t stride=1, size_t offset=0); - - /// Load the header of a sample list file - sample_list_header load_header(const std::string& samplelist_file) const; - - /// Extract a sample list from a serialized sample list in a string - void load_from_string(const std::string& samplelist); - - /// Tells how many samples in the list - size_t size() const; - - /// Tells if the internal list is empty - bool empty() const; - - /// Clear internal states - void clear(); - - template void serialize( Archive & ar ); - - /// Check if a sample index is in the valid range - bool check_index(size_t idx) const; - - /// Serialize sample list - bool to_string(std::string& sstr) const; - - /// Write the sample list - void write(const std::string filename) const; - - /// Allow read-only access to the internal list data - const samples_t& get_list() const; - - /// Allow the read-only access to the list header - const sample_list_header& get_header() const; - - /// Allow read-only access to the metadata of the idx-th sample in the list - const sample_t& operator[](size_t idx) const; - - const std::string& get_samples_filename(sample_file_id_t id) const { - return std::get<0>(m_file_id_stats_map[id]); - } - - const std::string& get_samples_dirname() const { - return m_header.get_file_dir(); - } - - hid_t get_samples_hdf5_handle(sample_file_id_t id) const { - hid_t h = std::get<1>(m_file_id_stats_map[id]); - return h; - } - - void set_samples_filename(sample_file_id_t id, const std::string& filename) { - std::get<0>(m_file_id_stats_map[id]) = filename; - } - - void set_files_hdf5_handle(const std::string& filename, hid_t h) { - sample_file_id_t id = 0; - for (auto&& e : m_file_id_stats_map) { - if(std::get<0>(e) == filename) { - std::get<1>(e) = h; - break; - } - id++; - } - manage_open_hdf5_handles(id, true); - } - - void delete_hdf5_handle_pq_entry(sample_file_id_t id) { - for (std::deque::iterator it = m_open_fd_pq.begin(); it!=m_open_fd_pq.end(); ++it) { - if(it->first == id) { - it = m_open_fd_pq.erase(it); - break; - } - } - return; - } - - void manage_open_hdf5_handles(sample_file_id_t id, bool pre_open_fd = false) { - /// When we enter this function the priority queue is either empty or a heap - if(!m_open_fd_pq.empty()) { - if(m_open_fd_pq.size() > m_max_open_files) { - auto& f = m_open_fd_pq.front(); - auto& victim = m_file_id_stats_map[f.first]; - hid_t victim_fd = std::get<1>(victim); - std::pop_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); - m_open_fd_pq.pop_back(); - if(victim_fd > 0) { - conduit::relay::io::hdf5_close_file(victim_fd); - std::get<1>(victim) = 0; - } - } - } - - /// Before we can enqueue the any new access times for this descriptor, remove any - /// earlier descriptor - std::sort_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); - if(m_open_fd_pq.front().first == id) { - m_open_fd_pq.pop_front(); - } - std::make_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); - - auto& e = m_file_id_stats_map[id]; - auto& file_access_queue = std::get<2>(e); - if(!file_access_queue.empty()) { - if(!pre_open_fd) { - file_access_queue.pop_front(); - } - } - if(!file_access_queue.empty()) { - m_open_fd_pq.emplace_back(std::make_pair(id,file_access_queue.front())); - }else { - /// If there are no future access of the file place a terminator entry to track - /// the open file, but is always sorted to the top of the heap - m_open_fd_pq.emplace_back(std::make_pair(id,std::make_pair(INT_MAX,id))); - } - std::push_heap(m_open_fd_pq.begin(), 
m_open_fd_pq.end(), pq_cmp); - return; - } - - hid_t open_samples_hdf5_handle(const size_t i, bool pre_open_fd = false) { - const sample_t& s = m_sample_list[i]; - sample_file_id_t id = s.first; - hid_t h = get_samples_hdf5_handle(id); - if (h <= static_cast(0)) { - const std::string& file_name = get_samples_filename(id); - const std::string conduit_file_path = add_delimiter(get_samples_dirname()) + file_name; - if (file_name.empty() || !check_if_file_exists(conduit_file_path)) { - LBANN_ERROR(std::string{} + " :: data file '" + conduit_file_path + "' does not exist."); - } - bool retry = false; - int retry_cnt = 0; - do { - try { - h = conduit::relay::io::hdf5_open_file_for_read( conduit_file_path ); - }catch (conduit::Error const& e) { - LBANN_WARNING(" :: trying to open the file " + conduit_file_path + " and got " + e.what()); - retry = true; - retry_cnt++; - }catch (...) { - LBANN_ERROR("trying to open the file " + conduit_file_path + " and got an unknown exception"); - } - }while(retry && retry_cnt < 3); - - if (h <= static_cast(0)) { - LBANN_ERROR(std::string{} + " :: data file '" + conduit_file_path + "' could not be opened."); - } - auto& e = m_file_id_stats_map[id]; - std::get<1>(e) = h; - /// If a new file is opened, place it in the priority queue - manage_open_hdf5_handles(id, pre_open_fd); - } - return h; - } - - void close_if_done_samples_hdf5_handle(const size_t i) { - const sample_t& s = m_sample_list[i]; - sample_file_id_t id = s.first; - hid_t h = get_samples_hdf5_handle(id); - if (h > static_cast(0)) { - auto& e = m_file_id_stats_map[id]; - auto& file_access_queue = std::get<2>(e); - if(file_access_queue.empty()) { - conduit::relay::io::hdf5_close_file(std::get<1>(e)); - std::get<1>(e) = 0; - delete_hdf5_handle_pq_entry(id); - } - } - } - - void all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); - template size_t all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm); - void all_gather_packed_lists(lbann_comm& comm); - - void compute_epochs_file_usage(const std::vector& shufled_indices, int mini_batch_size, const lbann_comm& comm); - - protected: - - /// Reads a header line from the sample list given as a stream, and use the info string for error message - std::string read_header_line(std::istream& ifs, const std::string& filename, const std::string& info) const; - - /// Reads the header of a sample list - sample_list_header read_header(std::istream& istrm, const std::string& filename) const; - - /// Get the list of samples that exist in a conduit bundle - hid_t get_conduit_bundle_samples(std::string conduit_file_path, std::vector& sample_names, size_t included_samples, size_t excluded_samples); - - /// read the body of exclusive sample list - void read_exclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0); - - /// read the body of inclusive sample list - void read_inclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0); - - /// Reads a sample list and populates the internal list - size_t get_samples_per_file(std::istream& istrm, const std::string& filename, size_t stride=1, size_t offset=0); - - /// Add the header info to the given string - void write_header(std::string& sstr, size_t num_files) const; - - static bool pq_cmp(fd_use_map_t left, fd_use_map_t right) { - return ((left.second).first < (right.second).first) || - (((left.second).first == (right.second).first) && - ((left.second).second < (right.second).second)); } - - private: - /// header info of sample list - 
sample_list_header m_header; - - /// List of all samples with a file identifier and sample name for each sample - samples_t m_sample_list; - - /// Maps sample's file id to file names, file descriptors, and use counts - file_id_stats_v_t m_file_id_stats_map; - - /// Track the number of samples per file - std::unordered_map m_file_map; - - /// Track the number of open file descriptors and when they will be used next - std::deque m_open_fd_pq; - - size_t m_max_open_files; -}; - -void handle_mpi_error(int ierr); - -#ifndef _JAG_OFFLINE_TOOL_MODE_ -void distribute_sample_list(const sample_list_jag& sn, - std::string& my_samples, - lbann_comm& comm); -#else -void distribute_sample_list(const sample_list_jag& sn, - std::string& my_samples, - MPI_Comm& comm); -#endif - -} // end of namespace - -#include "sample_list_jag_impl.hpp" - -#endif // __SAMPLE_LIST_JAG_HPP__ diff --git a/include/lbann/data_readers/sample_list_jag_impl.hpp b/include/lbann/data_readers/sample_list_jag_impl.hpp deleted file mode 100644 index 6b7ea1eeaa8..00000000000 --- a/include/lbann/data_readers/sample_list_jag_impl.hpp +++ /dev/null @@ -1,683 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include "sample_list_jag.hpp" -#include "lbann/utils/exception.hpp" -#include "lbann/utils/file_utils.hpp" -#include -#include "hdf5.h" -#include "conduit/conduit.hpp" -#include "conduit/conduit_relay.hpp" -#include "conduit/conduit_relay_io_hdf5.hpp" -#include -#include - -#include -#include -#include - -namespace lbann { - -inline sample_list_header::sample_list_header() - : m_is_exclusive(false), m_included_sample_count(0u), m_excluded_sample_count(0u), m_num_files(0u), m_file_dir("") { -} - -inline bool sample_list_header::is_exclusive() const { - return m_is_exclusive; -} - -inline size_t sample_list_header::get_sample_count() const { - return m_included_sample_count; -} - -inline size_t sample_list_header::get_num_files() const { - return m_num_files; -} - -inline const std::string& sample_list_header::get_sample_list_filename() const { - return m_sample_list_filename; -} - -inline const std::string& sample_list_header::get_file_dir() const { - return m_file_dir; -} - -inline sample_list_jag::sample_list_jag() { - m_max_open_files = getdtablesize() - LBANN_MAX_OPEN_FILE_MARGIN; -} - -inline sample_list_jag::~sample_list_jag() { - // Close the existing open files - for(auto f : m_file_id_stats_map) { - if(std::get<1>(f) > 0) { - conduit::relay::io::hdf5_close_file(std::get<1>(f)); - } - std::get<1>(f) = 0; - std::get<2>(f).clear(); - } - m_file_id_stats_map.clear(); - m_open_fd_pq.clear(); -} - -inline sample_list_jag::sample_list_jag(const sample_list_jag& rhs) { - copy_members(rhs); -} - -inline sample_list_jag& sample_list_jag::operator=(const sample_list_jag& rhs) { - // check for self-assignment - if (this == &rhs) { - return (*this); - } - - copy_members(rhs); - - return (*this); -} - -inline sample_list_jag& sample_list_jag::copy(const sample_list_jag& rhs) { - // check for self-assignment - if (this == &rhs) { - return (*this); - } - - copy_members(rhs); - - return (*this); -} - -inline void sample_list_jag::copy_members(const sample_list_jag& rhs) { - m_header = rhs.m_header; - m_sample_list = rhs.m_sample_list; - m_file_id_stats_map = rhs.m_file_id_stats_map; - m_file_map = rhs.m_file_map; - m_max_open_files = rhs.m_max_open_files; - - /// Keep track of existing filenames but do not copy any file - /// descriptor information - for(auto&& e : m_file_id_stats_map) { - 
if(std::get<1>(e) > 0) { - std::get<1>(e) = 0; - } - std::get<2>(e).clear(); - } - - /// Do not copy the open file descriptor priority queue - /// File handle ownership is not transfered in the copy - m_open_fd_pq.clear(); -} - -inline void sample_list_jag::load(const std::string& samplelist_file, size_t stride, size_t offset) { - std::ifstream istr(samplelist_file); - get_samples_per_file(istr, samplelist_file, stride, offset); - istr.close(); -} - -inline sample_list_header sample_list_jag::load_header(const std::string& samplelist_file) const { - std::ifstream istr(samplelist_file); - return read_header(istr, samplelist_file); -} - -inline void sample_list_jag::load_from_string(const std::string& samplelist) { - std::istringstream istr(samplelist); - get_samples_per_file(istr, "", 1, 0); -} - -inline size_t sample_list_jag::size() const { - return m_sample_list.size(); -} - -inline bool sample_list_jag::empty() const { - return m_sample_list.empty(); -} - -inline std::string sample_list_jag::read_header_line(std::istream& istrm, const std::string& filename, const std::string& info) const { - if (!istrm.good()) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: unable to read the header line of sample list " + filename + " for " + info); - } - - std::string line; - std::getline(istrm, line); - - if (line.empty()) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: unable to read the header line of sample list " + filename + " for " + info - + " -- the line was empty"); - } - return line; -} - - -inline sample_list_header sample_list_jag::read_header(std::istream& istrm, const std::string& filename) const { - sample_list_header hdr; - - hdr.m_sample_list_filename = filename; - - std::string line1 = read_header_line(istrm, filename, "the exclusiveness"); - std::stringstream header1(line1); - - std::string line2 = read_header_line(istrm, filename, "the number of samples and the number of files"); - std::stringstream header2(line2); - - std::string line3 = read_header_line(istrm, filename, "the data file directory"); - std::stringstream header3(line3); - - std::string sample_list_type; - header1 >> sample_list_type; - std::for_each(sample_list_type.begin(), sample_list_type.end(), [](char& c){ c = std::toupper(c); }); - - const std::string type_exclusive = conduit_hdf5_exclusion_list; - size_t found = sample_list_type.find(type_exclusive); - - if (found != std::string::npos) { - hdr.m_is_exclusive = true; - } else { - hdr.m_is_exclusive = false; - } - - header2 >> hdr.m_included_sample_count; - header2 >> hdr.m_excluded_sample_count; - header2 >> hdr.m_num_files; - - header3 >> hdr.m_file_dir; - - if (hdr.get_file_dir().empty() || !check_if_dir_exists(hdr.get_file_dir())) { - LBANN_ERROR(std::string{} + "file " + filename - + " :: data root directory '" + hdr.get_file_dir() + "' does not exist."); - } - - return hdr; -} - -inline hid_t sample_list_jag::get_conduit_bundle_samples(std::string conduit_file_path, std::vector& sample_names, size_t included_samples, size_t excluded_samples) { - hid_t hdf5_file_hnd = 0; - bool retry = false; - int retry_cnt = 0; - do { - try { - hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( conduit_file_path ); - }catch (conduit::Error const& e) { - LBANN_WARNING(" :: trying to open the file " + conduit_file_path + " and got " + e.what()); - retry = true; - retry_cnt++; - } - }while(retry && retry_cnt < LBANN_MAX_OPEN_FILE_RETRY); - - if (hdf5_file_hnd <= 
static_cast(0)) { - std::cout << "Opening the file didn't work" << std::endl; - return hdf5_file_hnd; - } - - conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", sample_names); - - if(sample_names.size() != (included_samples + excluded_samples)) { - LBANN_ERROR(std::string("File does not contain the correct number of samples: found ") - + std::to_string(sample_names.size()) - + std::string(" -- this does not equal the expected number of samples that are marked for inclusion: ") - + std::to_string(included_samples) - + std::string(" and exclusion: ") - + std::to_string(excluded_samples)); - } - - return hdf5_file_hnd; -} - -inline void sample_list_jag::read_exclusive_list(std::istream& istrm, size_t stride, size_t offset) { - const std::string whitespaces(" \t\f\v\n\r"); - size_t cnt_files = 0u; - std::string line; - - while (std::getline(istrm, line)) { - const size_t end_of_str = line.find_last_not_of(whitespaces); - if (end_of_str == std::string::npos) { // empty line - continue; - } - if (cnt_files++ >= m_header.get_num_files()) { - break; - } - // Check to see if there is a strided load and skip the lines that are not for this rank - if ((cnt_files-1)%stride != offset) { - continue; - } - - std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing - std::string filename; - size_t included_samples; - size_t excluded_samples; - std::unordered_set excluded_sample_indices; - - sstr >> filename >> included_samples >> excluded_samples; - - const std::string conduit_file_path = add_delimiter(m_header.get_file_dir()) + filename; - - if (filename.empty() || !check_if_file_exists(conduit_file_path)) { - LBANN_ERROR(std::string{} + " :: data file '" + conduit_file_path + "' does not exist."); - } - - excluded_sample_indices.reserve(excluded_samples); - - while(!sstr.eof()) { - std::string index; - sstr >> index; - excluded_sample_indices.insert(index); - } - - if(excluded_sample_indices.size() != excluded_samples) { - LBANN_ERROR(std::string("Index file does not contain the correct number of excluded samples: expected ") - + std::to_string(excluded_samples) - + std::string(" exclusions but found ") - + std::to_string(excluded_sample_indices.size())); - } - - std::vector sample_names; - hid_t hdf5_file_hnd = get_conduit_bundle_samples(conduit_file_path, sample_names, included_samples, excluded_samples); - if(hdf5_file_hnd <= static_cast(0)) { - continue; // skipping the file - } - - if(m_file_map.count(filename) > 0) { - if(sample_names.size() != m_file_map[filename]) { - LBANN_ERROR(std::string("The same file ") - + filename - + " was opened multiple times and reported different sizes: " - + std::to_string(sample_names.size()) - + " and " - + std::to_string(m_file_map[filename])); - } - }else { - m_file_map[filename] = sample_names.size(); - } - - sample_file_id_t index = m_file_id_stats_map.size(); - m_file_id_stats_map.emplace_back(std::make_tuple(filename, 0, std::deque>{})); - set_files_hdf5_handle(filename, hdf5_file_hnd); - - size_t valid_sample_count = 0u; - for(auto s : sample_names) { - std::unordered_set::const_iterator found = excluded_sample_indices.find(s); - if (found != excluded_sample_indices.cend()) { - continue; - } - m_sample_list.emplace_back(index, s); - valid_sample_count++; - } - - if(valid_sample_count != included_samples) { - LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") - + std::to_string(included_samples) - + std::string(" samples, but found ") - + 
std::to_string(valid_sample_count)); - } - } - - if (m_header.get_num_files() != cnt_files) { - LBANN_ERROR(std::string("Sample list ") - + m_header.get_sample_list_filename() - + std::string(": number of files requested ") - + std::to_string(m_header.get_num_files()) - + std::string(" does not equal number of files loaded ") - + std::to_string(cnt_files)); - } - - m_header.m_is_exclusive = false; -} - - -inline void sample_list_jag::read_inclusive_list(std::istream& istrm, size_t stride, size_t offset) { - const std::string whitespaces(" \t\f\v\n\r"); - size_t cnt_files = 0u; - std::string line; - - while (std::getline(istrm, line)) { - const size_t end_of_str = line.find_last_not_of(whitespaces); - if (end_of_str == std::string::npos) { // empty line - continue; - } - if (cnt_files++ >= m_header.get_num_files()) { - break; - } - // Check to see if there is a strided load and skip the lines that are not for this rank - if ((cnt_files-1)%stride != offset) { - continue; - } - - std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing - std::string filename; - size_t included_samples; - size_t excluded_samples; - - sstr >> filename >> included_samples >> excluded_samples; - - const std::string conduit_file_path = add_delimiter(m_header.get_file_dir()) + filename; - - if (filename.empty() || !check_if_file_exists(conduit_file_path)) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: data file '" + filename + "' does not exist."); - } - - std::vector sample_names; - hid_t hdf5_file_hnd = get_conduit_bundle_samples(conduit_file_path, sample_names, included_samples, excluded_samples); - if(hdf5_file_hnd <= static_cast(0)) { - continue; // skipping the file - } - - if(m_file_map.count(filename) > 0) { - if(sample_names.size() != m_file_map[filename]) { - LBANN_ERROR(std::string("The same file ") - + filename - + " was opened multiple times and reported different sizes: " - + std::to_string(sample_names.size()) - + " and " - + std::to_string(m_file_map[filename])); - } - }else { - m_file_map[filename] = sample_names.size(); - } - - std::unordered_set set_of_samples(sample_names.begin(), sample_names.end()); - - sample_file_id_t index = m_file_id_stats_map.size(); - m_file_id_stats_map.emplace_back(std::make_tuple(filename, 0, std::deque>{})); - set_files_hdf5_handle(filename, hdf5_file_hnd); - - size_t valid_sample_count = 0u; - while(!sstr.eof()) { - std::string sample_name;; - sstr >> sample_name; - std::unordered_set::const_iterator found = set_of_samples.find(sample_name); - if (found == set_of_samples.cend()) { - LBANN_ERROR(std::string("Illegal request for a data ID that does not exist: ") + sample_name); - } - m_sample_list.emplace_back(index, sample_name); - valid_sample_count++; - } - if(valid_sample_count != included_samples) { - LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") - + std::to_string(included_samples) - + std::string(" samples, but found ") - + std::to_string(valid_sample_count)); - } - } - - if (m_header.get_num_files() != cnt_files) { - LBANN_ERROR(std::string("Sample list number of files requested ") - + std::to_string(m_header.get_num_files()) - + std::string(" does not equal number of files loaded ") - + std::to_string(cnt_files)); - } -} - - -inline size_t sample_list_jag::get_samples_per_file(std::istream& istrm, const std::string& filename, size_t stride, size_t offset) { - m_header = read_header(istrm, filename); - 
m_sample_list.reserve(m_header.get_sample_count()); - - if (m_header.is_exclusive()) { - read_exclusive_list(istrm, stride, offset); - } else { - read_inclusive_list(istrm, stride, offset); - } - - if(stride == 1 && m_header.get_sample_count() != m_sample_list.size()) { - LBANN_ERROR(std::string("Sample list count ") - + std::to_string(m_header.get_sample_count()) - + std::string(" does not equal sample list size ") - + std::to_string(m_sample_list.size())); - } - - return m_sample_list.size(); -} - - -inline void sample_list_jag::all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm) { - int size_of_list_archive = archive.size(); - std::vector packed_sizes(comm.get_procs_per_trainer()); - - comm.trainer_all_gather(size_of_list_archive, packed_sizes); - - int total_packed_size = 0; - std::vector displ; - displ.assign(comm.get_procs_per_trainer()+1, 0); - - for (size_t i = 0u; i < packed_sizes.size(); ++i) { - const auto sz = packed_sizes[i]; - displ[i+1] = displ[i] + sz; - } - total_packed_size = displ.back(); - - if (total_packed_size <= 0) { - return; - } - - std::string all_samples; - all_samples.resize(static_cast(total_packed_size)); - - std::vector local_data(archive.begin(), archive.end()); - std::vector packed_data(all_samples.begin(), all_samples.end()); - comm.trainer_all_gather(local_data, - packed_data, - packed_sizes, - displ); - - for (size_t i = 0u; i < packed_sizes.size(); ++i) { - std::string& buf = gathered_archive[i]; - const auto sz = packed_sizes[i]; - displ[i+1] = displ[i] + sz; - std::vector::const_iterator first = packed_data.begin() + displ[i]; - std::vector::const_iterator last = packed_data.begin() + displ[i] + sz; - buf.resize(sz); - buf.assign(first, last); - } - return; -} - -template -inline size_t sample_list_jag::all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm) { - std::string archive; - std::stringstream ss; - cereal::BinaryOutputArchive oarchive(ss); - oarchive(data); - archive = ss.str(); - - std::vector gathered_archive(comm.get_procs_per_trainer()); - - all_gather_archive(archive, gathered_archive, comm); - - std::vector per_rank_data(comm.get_procs_per_trainer()); - - size_t gathered_field_size = 0; - for (size_t i = 0u; i < gathered_archive.size(); ++i) { - std::string& buf = gathered_archive[i]; - T& tmp = gathered_data[i]; - - std::stringstream in_ss(buf); - cereal::BinaryInputArchive iarchive(in_ss); - iarchive(tmp); - gathered_field_size += tmp.size(); - } - return gathered_field_size; -} - -inline void sample_list_jag::all_gather_packed_lists(lbann_comm& comm) { - int num_ranks = comm.get_procs_per_trainer(); - std::vector per_rank_samples(num_ranks); - std::vector per_rank_file_id_stats_map(num_ranks); - std::vector> per_rank_file_map(num_ranks); - - // Close the existing open files - for(auto&& e : m_file_id_stats_map) { - if(std::get<1>(e) > 0) { - conduit::relay::io::hdf5_close_file(std::get<1>(e)); - std::get<1>(e) = 0; - } - std::get<2>(e).clear(); - } - m_open_fd_pq.clear(); - - size_t num_samples = all_gather_field(m_sample_list, per_rank_samples, comm); - size_t num_ids = all_gather_field(m_file_id_stats_map, per_rank_file_id_stats_map, comm); - size_t num_files = all_gather_field(m_file_map, per_rank_file_map, comm); - - m_sample_list.clear(); - m_file_id_stats_map.clear(); - - m_sample_list.reserve(num_samples); - m_file_id_stats_map.reserve(num_ids); - m_file_map.reserve(num_files); - - for(int r = 0; r < num_ranks; r++) { - const samples_t& sample_list = 
per_rank_samples[r]; - const file_id_stats_v_t& file_id_stats_map = per_rank_file_id_stats_map[r]; - const std::unordered_map& file_map = per_rank_file_map[r]; - for (const auto& s : sample_list) { - sample_file_id_t index = s.first; - const std::string& filename = std::get<0>(file_id_stats_map[index]); - if(index >= m_file_id_stats_map.size() - || (std::get<0>(m_file_id_stats_map.back()) != filename)) { - index = m_file_id_stats_map.size(); - m_file_id_stats_map.emplace_back(std::make_tuple(filename, 0, std::deque>{})); - // Update the file map structure - if(m_file_map.count(filename) == 0) { - m_file_map[filename] = file_map.at(filename); - } - }else { - for(size_t i = 0; i < m_file_id_stats_map.size(); i++) { - if(filename == std::get<0>(m_file_id_stats_map[i])) { - index = i; - break; - } - } - } - m_sample_list.emplace_back(std::make_pair(index, s.second)); - } - } - - return; -} - -inline void sample_list_jag::compute_epochs_file_usage(const std::vector& shuffled_indices, int mini_batch_size, const lbann_comm& comm) { - for (auto&& e : m_file_id_stats_map) { - if(std::get<1>(e) > 0) { - conduit::relay::io::hdf5_close_file(std::get<1>(e)); - } - std::get<1>(e) = 0; - std::get<2>(e).clear(); - } - // Once all of the file handles are closed, clear the priority queue - m_open_fd_pq.clear(); - - for (size_t i = 0; i < shuffled_indices.size(); i++) { - int idx = shuffled_indices[i]; - const auto& s = m_sample_list[idx]; - sample_file_id_t index = s.first; - - if((i % mini_batch_size) % comm.get_procs_per_trainer() == static_cast(comm.get_rank_in_trainer())) { - /// Enqueue the iteration step when the sample will get used - int step = i / mini_batch_size; - int substep = (i % mini_batch_size) / comm.get_procs_per_trainer(); - std::get<2>(m_file_id_stats_map[index]).emplace_back(std::make_pair(step, substep)); - } - } -} - -inline void sample_list_jag::clear() { - m_sample_list.clear(); -} - -template void sample_list_jag::serialize( Archive & ar ) { - ar(m_header, m_sample_list, m_file_id_stats_map); -} - -inline void sample_list_jag::write_header(std::string& sstr, size_t num_files) const { - // The first line indicate if the list is exclusive or inclusive - // The next line contains the number of samples and the number of files, which are the same in this caes - // The next line contains the root data file directory - - sstr += (m_header.is_exclusive()? 
conduit_hdf5_exclusion_list + "\n" : conduit_hdf5_inclusion_list + "\n"); - /// Include the number of invalid samples, which for an inclusive index list is always 0 - sstr += std::to_string(m_sample_list.size()) + " 0 " + std::to_string(num_files) + '\n'; - sstr += m_header.get_file_dir() + '\n'; -} - - -inline bool sample_list_jag::to_string(std::string& sstr) const { - std::map> tmp_file_map; - for (const auto& s : m_sample_list) { - std::string filename = std::get<0>(m_file_id_stats_map[s.first]); - tmp_file_map[filename].emplace_back(s.second); - } - - samples_t::const_iterator it_begin = m_sample_list.cbegin(); - samples_t::const_iterator it_end = m_sample_list.cbegin(); - - sstr.clear(); - - // reserve the string to hold the entire sample lit - size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1; - if (it_begin < it_end) { - estimated_len += tmp_file_map.size(); - sstr.reserve(estimated_len); - } - - // write the list header - write_header(sstr, tmp_file_map.size()); - - // write the list body - for (const auto& f : tmp_file_map) { - // File name - sstr += f.first; - // Number of included samples - sstr += std::string(" ") + std::to_string(f.second.size()); - // Number of excluded samples - sstr += std::string(" ") + std::to_string(m_file_map.at(f.first) - f.second.size()); - // Inclusion sample list - for (const auto& s : f.second) { - sstr += ' ' + s; - } - sstr += '\n'; - } - - return true; -} - -inline void sample_list_jag::write(const std::string filename) const { - std::string dir, basename; - parse_path(filename, dir, basename); - if (!dir.empty() && !check_if_dir_exists(dir)) { - // The creation of a shared directory must be done once in a coordinated fashion - // among the entities that have access to it. Thus, it must be done in advance - std::cerr << "The sample list output directory (" + dir + ") does not exist" << std::endl; - return; - } - - std::fstream ofs(filename, std::fstream::out | std::fstream::binary); - - if (!ofs.good()) { - return; - } - - std::string buf; - to_string(buf); - - ofs.write(buf.data(), buf.size()*sizeof(std::string::value_type)); - ofs.close(); -} - -inline const sample_list_jag::samples_t& sample_list_jag::get_list() const { - return m_sample_list; -} - -inline const sample_list_header& sample_list_jag::get_header() const { - return m_header; -} - -inline const sample_list_jag::sample_t& sample_list_jag::operator[](size_t idx) const { - return m_sample_list[idx]; -} - -} // end of namespace lbann diff --git a/include/lbann/data_readers/sample_list_open_files.hpp b/include/lbann/data_readers/sample_list_open_files.hpp new file mode 100644 index 00000000000..57bfb89980e --- /dev/null +++ b/include/lbann/data_readers/sample_list_open_files.hpp @@ -0,0 +1,152 @@ +#ifndef __SAMPLE_LIST_OPEN_FILES_HPP__ +#define __SAMPLE_LIST_OPEN_FILES_HPP__ + +#include "sample_list.hpp" + +/// Number of system and other files that may be open during execution +#define LBANN_MAX_OPEN_FILE_MARGIN 128 +#define LBANN_MAX_OPEN_FILE_RETRY 3 + +namespace lbann { + +template +class sample_list_open_files : public sample_list { + public: + /// The type for the index assigned to each sample file + using sample_file_id_t = std::size_t; + /** To describe a sample as a pair of the file to which it belongs and its name + Each file may contain multiple samples. 
*/ + using sample_t = std::pair; + /// Information for each file used by the sample list: includes the file name, file descriptor, and + /// and a queue of each step and substep when data will be loaded from the file + using file_id_stats_t = std::tuple>>; + + /// Type for the list of samples + using samples_t = std::template vector< sample_t >; + /// Mapping of the file index to the statistics for each file + using file_id_stats_v_t = std::vector< file_id_stats_t >; // rename to sample_to_file_v or something + /// Type for the map of file descriptors to usage step and substep + using fd_use_map_t = std::template pair>; + + sample_list_open_files(); + virtual ~sample_list_open_files(); + /** Copy constructor repllicates all the member variables as they are except + * the file information vector, for which only the file name is copied. */ + sample_list_open_files(const sample_list_open_files& rhs); + /** assignemnt operation repllicates all the member variables as they are except + * the file information vector, for which only the file name is copied. */ + sample_list_open_files& operator=(const sample_list_open_files& rhs); + sample_list_open_files& copy(const sample_list_open_files& rhs); + + void copy_members(const sample_list_open_files& rhs); + + /// Tells how many samples in the list + size_t size() const override; + + /// Tells how many sample files are there + size_t get_num_files() const override; + + using sample_list::load; + /// Emit a serialized archive using the cereal library + template void save( Archive & ar ) const; + /// Restore the member variables from a given archrive serialized by the cereal library + template void load( Archive & ar ); + + /// Serialize this sample list into an std::string object + bool to_string(std::string& sstr) const override; + + /// Allow read-only access to the internal list data + const samples_t& get_list() const; + + /// Allow read-only access to the metadata of the idx-th sample in the list + const sample_t& operator[](size_t idx) const; + + const std::string& get_samples_filename(sample_file_id_t id) const override; + + file_handle_t get_samples_file_handle(sample_file_id_t id) const; + + void set_files_handle(const std::string& filename, file_handle_t h); + + void delete_file_handle_pq_entry(sample_file_id_t id); + + void manage_open_file_handles(sample_file_id_t id, bool pre_open_fd = false); + + file_handle_t open_samples_file_handle(const size_t i, bool pre_open_fd = false); + + virtual void close_if_done_samples_file_handle(const size_t i); + + void compute_epochs_file_usage(const std::vector& shufled_indices, int mini_batch_size, const lbann_comm& comm); + + virtual bool is_file_handle_valid(const file_handle_t& h) const = 0; + + void all_gather_packed_lists(lbann_comm& comm) override; + + protected: + + void set_samples_filename(sample_file_id_t id, const std::string& filename) override; + + /// Get the list of samples from a specific type of bundle file + virtual void obtain_sample_names(file_handle_t& h, std::vector& sample_names) const = 0; + + file_handle_t open_file_handle(std::string file_path); + + /// Get the list of samples that exist in a bundle file + file_handle_t get_bundled_sample_names(std::string file_path, std::vector& sample_names, size_t included_samples, size_t excluded_samples); + + /// Check that the list of samples given actually exist in a bundle file + void validate_implicit_bundles_sample_names(std::string file_path, std::string filename, std::vector& sample_names, size_t included_samples, size_t 
excluded_samples); + + /// read the body of exclusive sample list + void read_exclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0); + + /// read the body of inclusive sample list + void read_inclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0); + + /// read the body of a sample list + void read_sample_list(std::istream& istrm, size_t stride=1, size_t offset=0) override; + + void assign_samples_name() override {} + + /// Get the number of total/included/excluded samples + void get_num_samples(size_t& total, size_t& included, size_t& excluded) const override; + + static bool pq_cmp(fd_use_map_t left, fd_use_map_t right) { + return ((left.second).first < (right.second).first) || + (((left.second).first == (right.second).first) && + ((left.second).second < (right.second).second)); } + + virtual file_handle_t open_file_handle_for_read(const std::string& file_path) = 0; + virtual void close_file_handle(file_handle_t& h) = 0; + virtual void clear_file_handle(file_handle_t& h) = 0; + + private: + using sample_list::serialize; + template void serialize( Archive & ar ) = delete; + + protected: + using sample_list::m_header; + + /// Maps sample's file id to file names, file descriptors, and use counts + file_id_stats_v_t m_file_id_stats_map; + + private: + /// List of all samples with a file identifier and sample name for each sample + samples_t m_sample_list; + + /// Track the number of samples per file + std::unordered_map m_file_map; + + /// Track the number of open file descriptors and when they will be used next + std::deque m_open_fd_pq; + + size_t m_max_open_files; +}; + +template +inline T uninitialized_file_handle(); + +} // end of namespace + +#include "sample_list_open_files_impl.hpp" + +#endif // __SAMPLE_LIST_OPEN_FILES_HPP__ diff --git a/include/lbann/data_readers/sample_list_open_files_impl.hpp b/include/lbann/data_readers/sample_list_open_files_impl.hpp new file mode 100644 index 00000000000..565b016bd22 --- /dev/null +++ b/include/lbann/data_readers/sample_list_open_files_impl.hpp @@ -0,0 +1,719 @@ +namespace lbann { + +template +inline sample_list_open_files::sample_list_open_files() { + m_max_open_files = getdtablesize() - LBANN_MAX_OPEN_FILE_MARGIN; +} + +template +inline sample_list_open_files::~sample_list_open_files() { + m_open_fd_pq.clear(); +} + +template +inline sample_list_open_files +::sample_list_open_files(const sample_list_open_files& rhs) { + copy_members(rhs); +} + +template +inline sample_list_open_files& +sample_list_open_files +::operator=(const sample_list_open_files& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + + copy_members(rhs); + + return (*this); +} + +template +inline sample_list_open_files& +sample_list_open_files +::copy(const sample_list_open_files& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + + copy_members(rhs); + + return (*this); +} + +template +inline void sample_list_open_files +::copy_members(const sample_list_open_files& rhs) { + sample_list::copy_members(rhs); + m_sample_list = rhs.m_sample_list; + m_file_map = rhs.m_file_map; + m_max_open_files = rhs.m_max_open_files; + + /// Keep track of existing filenames but do not copy any file + /// descriptor information + m_file_id_stats_map.assign(rhs.m_file_id_stats_map.size(), + std::make_tuple("", + uninitialized_file_handle(), + std::deque>{})); + + for(size_t i = 0u; i < m_file_id_stats_map.size(); ++i) { + set_samples_filename(i, rhs.get_samples_filename(i)); + } + + /// Do not copy 
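sample_list_open_files caps how many file handles stay open (m_max_open_files is derived from getdtablesize() in the constructor above) and keeps the open handles in a heap ordered by the (step, substep) at which each file is needed next, as recorded by compute_epochs_file_usage. A simplified, self-contained sketch of the eviction choice encoded by pq_cmp (the types and names here are stand-ins, not the class's own):

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// (file id, (step, substep) of the file's next pending use)
using use_entry_t = std::pair<std::size_t, std::pair<int, int>>;

// Same ordering as pq_cmp: an earlier (step, substep) compares "less".
bool earlier_use(const use_entry_t& lhs, const use_entry_t& rhs) {
  return lhs.second < rhs.second;  // lexicographic pair comparison
}

// Precondition: entries already form a heap under earlier_use (built with
// std::make_heap/push_heap). The front is then the file whose next use lies
// furthest in the future; entries with no pending use carry a sentinel step
// of INT_MAX and float to the front, so they are closed first once the pool
// exceeds its open-file limit.
std::size_t pick_victim(std::vector<use_entry_t>& heap) {
  std::pop_heap(heap.begin(), heap.end(), earlier_use);
  const std::size_t victim = heap.back().first;
  heap.pop_back();
  return victim;
}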
the open file descriptor priority queue + /// File handle ownership is not transfered in the copy + m_open_fd_pq.clear(); +} + +template +inline size_t sample_list_open_files +::size() const { + return m_sample_list.size(); +} + +template +inline size_t sample_list_open_files +::get_num_files() const { + return m_file_id_stats_map.size(); +} + +template +inline void sample_list_open_files +::read_exclusive_list(std::istream& istrm, + size_t stride, size_t offset) { + const std::string whitespaces(" \t\f\v\n\r"); + size_t cnt_files = 0u; + std::string line; + + while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + if (cnt_files++ >= m_header.get_num_files()) { + break; + } + // Check to see if there is a strided load and skip the lines that are not for this rank + if ((cnt_files-1)%stride != offset) { + continue; + } + + std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing + std::string filename; + size_t included_samples; + size_t excluded_samples; + std::unordered_set excluded_sample_indices; + + sstr >> filename >> included_samples >> excluded_samples; + + const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; + + if (filename.empty() || !check_if_file_exists(file_path)) { + LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' does not exist."); + } + + excluded_sample_indices.reserve(excluded_samples); + + while(!sstr.eof()) { + std::string index; + sstr >> index; + excluded_sample_indices.insert(index); + } + + if(excluded_sample_indices.size() != excluded_samples) { + LBANN_ERROR(std::string("Index file does not contain the correct number of excluded samples: expected ") + + std::to_string(excluded_samples) + + std::string(" exclusions but found ") + + std::to_string(excluded_sample_indices.size())); + } + + std::vector sample_names; + file_handle_t file_hnd = get_bundled_sample_names(file_path, sample_names, included_samples, excluded_samples); + if (!is_file_handle_valid(file_hnd)) { + continue; // skipping the file + } + + if(m_file_map.count(filename) > 0) { + if(sample_names.size() != m_file_map[filename]) { + LBANN_ERROR(std::string("The same file ") + + filename + + " was opened multiple times and reported different sizes: " + + std::to_string(sample_names.size()) + + " and " + + std::to_string(m_file_map[filename])); + } + }else { + m_file_map[filename] = sample_names.size(); + } + + sample_file_id_t index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(std::make_tuple(filename, uninitialized_file_handle(), std::deque>{})); + set_files_handle(filename, file_hnd); + + size_t valid_sample_count = 0u; + for(auto s : sample_names) { + std::unordered_set::const_iterator found = excluded_sample_indices.find(s); + if (found != excluded_sample_indices.cend()) { + continue; + } + m_sample_list.emplace_back(index, to_sample_name_t(s)); + valid_sample_count++; + } + + if(valid_sample_count != included_samples) { + LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") + + std::to_string(included_samples) + + std::string(" samples, but found ") + + std::to_string(valid_sample_count)); + } + } + + if (m_header.get_num_files() != cnt_files) { + LBANN_ERROR(std::string("Sample list ") + + m_header.get_sample_list_filename() + + std::string(": number of files requested ") + + std::to_string(m_header.get_num_files()) + + 
std::string(" does not equal number of files loaded ") + + std::to_string(cnt_files)); + } + + m_header.m_is_exclusive = false; +} + + +template +inline void sample_list_open_files +::read_inclusive_list(std::istream& istrm, + size_t stride, size_t offset) { + const std::string whitespaces(" \t\f\v\n\r"); + size_t cnt_files = 0u; + std::string line; + + while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + if (cnt_files++ >= m_header.get_num_files()) { + break; + } + // Check to see if there is a strided load and skip the lines that are not for this rank + if ((cnt_files-1)%stride != offset) { + continue; + } + + std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing + std::string filename; + size_t included_samples; + size_t excluded_samples; + + sstr >> filename >> included_samples >> excluded_samples; + + const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; + + if (filename.empty() || !check_if_file_exists(file_path)) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: data file '" + filename + "' does not exist."); + } + + file_handle_t file_hnd = open_file_handle(file_path); + if (!is_file_handle_valid(file_hnd)) { + continue; // skipping the file + } + + sample_file_id_t index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(std::make_tuple(filename, uninitialized_file_handle(), std::deque>{})); + set_files_handle(filename, file_hnd); + + size_t valid_sample_count = 0u; + //#define VALIDATE_SAMPLE_LIST +#ifdef VALIDATE_SAMPLE_LIST + std::vector sample_names; +#endif + while(!sstr.eof()) { + std::string sample_name_str; + sstr >> sample_name_str; + m_sample_list.emplace_back(index, to_sample_name_t(sample_name_str)); +#ifdef VALIDATE_SAMPLE_LIST + sample_names.emplace_back(sample_name_str); +#endif + valid_sample_count++; + } + if(valid_sample_count != included_samples) { + LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") + + std::to_string(included_samples) + + std::string(" samples, but found ") + + std::to_string(valid_sample_count)); + } + + if(m_file_map.count(filename) > 0) { + if(valid_sample_count != m_file_map[filename]) { + LBANN_ERROR(std::string("The same file ") + + filename + + " was opened multiple times and reported different sizes: " + + std::to_string(valid_sample_count) + + " and " + + std::to_string(m_file_map[filename])); + } + }else { + m_file_map[filename] = /*valid_sample_count*/ included_samples + excluded_samples; + } +#ifdef VALIDATE_SAMPLE_LIST + validate_implicit_bundles_sample_names(file_path, filename, sample_names, included_samples, excluded_samples); +#endif + } + + if (m_header.get_num_files() != cnt_files) { + LBANN_ERROR(std::string("Sample list number of files requested ") + + std::to_string(m_header.get_num_files()) + + std::string(" does not equal number of files loaded ") + + std::to_string(cnt_files)); + } +} + + +template +inline void sample_list_open_files +::read_sample_list(std::istream& istrm, size_t stride, size_t offset) { + if (m_header.is_exclusive()) { + read_exclusive_list(istrm, stride, offset); + } else { + read_inclusive_list(istrm, stride, offset); + } +} + + +template +template +void sample_list_open_files +::save( Archive & ar ) const { + using ar_file_stats_t = std::tuple>>; + std::vector file_stats; + 
file_stats.reserve(m_file_id_stats_map.size()); + for(auto&& e : m_file_id_stats_map) { + file_stats.emplace_back(std::make_tuple(std::get<0>(e), std::get<2>(e))); + } + ar(m_header, m_sample_list, file_stats); +} + +template +template +void sample_list_open_files +::load( Archive & ar ) { + using ar_file_stats_t = std::tuple>>; + std::vector file_stats; + ar(m_header, m_sample_list, file_stats); + m_file_id_stats_map.reserve(file_stats.size()); + for(auto&& e : file_stats) { + //m_file_id_stats_map.emplace_back(std::make_tuple(std::get<0>(e), uninitialized_file_handle(), std::deque>{})); + m_file_id_stats_map.emplace_back(std::make_tuple(std::get<0>(e), uninitialized_file_handle(), std::get<1>(e))); + //m_file_id_stats_map.emplace_back(std::make_tuple(std::get<0>(e), file_handle_t(), std::get<1>(e))); + } +} + +template +inline bool sample_list_open_files +::to_string(std::string& sstr) const { + std::map> tmp_file_map; + for (const auto& s : m_sample_list) { + const std::string& filename = get_samples_filename(s.first); + tmp_file_map[filename].emplace_back(s.second); + } + + sstr.clear(); + + // reserve the string to hold the entire sample lit + size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1; + for (const auto& f : tmp_file_map) { + estimated_len += f.first.size() + + std::to_string(f.second.size()).size() + + std::to_string(m_file_map.at(f.first) - f.second.size()).size() + + 3u; + for (const auto& s : f.second) { + estimated_len += lbann::to_string(s).size() + 1u; + } + } + sstr.reserve(estimated_len); + + // write the list header + this->write_header(sstr, tmp_file_map.size()); + + // write the list body + for (const auto& f : tmp_file_map) { + // File name + sstr += f.first; + // Number of included samples + sstr += std::string(" ") + std::to_string(f.second.size()); + // Number of excluded samples + sstr += std::string(" ") + std::to_string(m_file_map.at(f.first) - f.second.size()); + // Inclusion sample list + for (const auto& s : f.second) { + sstr += ' ' + lbann::to_string(s); + } + sstr += '\n'; + } + + return true; +} + +template +inline void sample_list_open_files +::get_num_samples(size_t& total, size_t& included, size_t& excluded) const { + total = 0u; + for ( const auto f: m_file_map) { + total += f.second; + } + included = size(); + excluded = total - included; +} + +template +inline const typename sample_list_open_files::samples_t& +sample_list_open_files::get_list() const { + return m_sample_list; +} + +template +inline const typename sample_list_open_files::sample_t& +sample_list_open_files::operator[](size_t idx) const { + return m_sample_list[idx]; +} + +template +inline const std::string& sample_list_open_files +::get_samples_filename(sample_file_id_t id) const { + return std::get<0>(m_file_id_stats_map[id]); +} + +template +inline file_handle_t sample_list_open_files +::get_samples_file_handle(sample_file_id_t id) const { + file_handle_t h = std::get<1>(m_file_id_stats_map[id]); + return h; +} + +template +inline void sample_list_open_files +::set_samples_filename(sample_file_id_t id, const std::string& filename) { + std::get<0>(m_file_id_stats_map[id]) = filename; +} + +template +inline void sample_list_open_files +::set_files_handle(const std::string& filename, file_handle_t h) { + sample_file_id_t id = sample_file_id_t(0); + for (auto&& e : m_file_id_stats_map) { + if(std::get<0>(e) == filename) { + std::get<1>(e) = h; + break; + } + id++; + } + manage_open_file_handles(id, true); +} + +template +inline void sample_list_open_files 
+::obtain_sample_names(file_handle_t& h, std::vector& sample_names) const { + LBANN_ERROR(std::string{} + " :: abstract class does not implement this method"); +} + +template +inline file_handle_t sample_list_open_files +::open_file_handle(std::string file_path) { + file_handle_t file_hnd; + clear_file_handle(file_hnd); + bool retry = false; + int retry_cnt = 0; + do { + try { + file_hnd = open_file_handle_for_read( file_path ); + }catch (conduit::Error const& e) { + LBANN_WARNING(" :: trying to open the file " + file_path + " and got " + e.what()); + retry = true; + retry_cnt++; + } + }while(retry && retry_cnt < LBANN_MAX_OPEN_FILE_RETRY); + + return file_hnd; +} + +template +inline file_handle_t sample_list_open_files +::get_bundled_sample_names(std::string file_path, + std::vector& sample_names, + size_t included_samples, + size_t excluded_samples) { + file_handle_t file_hnd = open_file_handle(file_path); + + if (!is_file_handle_valid(file_hnd)) { + std::cout << "Opening the file didn't work" << std::endl; + return file_hnd; + } + + obtain_sample_names(file_hnd, sample_names); + + if(sample_names.size() != (included_samples + excluded_samples)) { + LBANN_ERROR(std::string("File does not contain the correct number of samples: found ") + + std::to_string(sample_names.size()) + + std::string(" -- this does not equal the expected number of samples that are marked for inclusion: ") + + std::to_string(included_samples) + + std::string(" and exclusion: ") + + std::to_string(excluded_samples)); + } + + return file_hnd; +} + +template +inline void sample_list_open_files +::validate_implicit_bundles_sample_names(std::string file_path, + std::string filename, + std::vector& sample_names, + size_t included_samples, + size_t excluded_samples) { + std::vector all_sample_names; + file_handle_t file_hnd = get_bundled_sample_names(file_path, all_sample_names, included_samples, excluded_samples); + if (!is_file_handle_valid(file_hnd)) { + return; // skipping the file + } + if(m_file_map.count(filename) > 0) { + if(all_sample_names.size() != m_file_map[filename]) { + LBANN_ERROR(std::string("The same file ") + + filename + + " was opened multiple times and reported different sizes: " + + std::to_string(all_sample_names.size()) + + " and " + + std::to_string(m_file_map[filename])); + } + }else { + m_file_map[filename] = all_sample_names.size(); + } + std::unordered_set set_of_samples(all_sample_names.begin(), all_sample_names.end()); + for(auto&& sample_name : sample_names) { + std::unordered_set::const_iterator found = set_of_samples.find(sample_name); + if (found == set_of_samples.cend()) { + LBANN_ERROR(std::string("Illegal request for a data ID that does not exist: ") + sample_name); + } + } + return; +} + +template +inline void sample_list_open_files +::all_gather_packed_lists(lbann_comm& comm) { + int num_ranks = comm.get_procs_per_trainer(); + typename std::vector per_rank_samples(num_ranks); + typename std::vector> per_rank_files(num_ranks); + std::vector my_files; + my_files.reserve(m_file_id_stats_map.size()); + std::vector> per_rank_file_map(num_ranks); + + // Close the existing open files + for(auto&& e : m_file_id_stats_map) { + auto& h = std::get<1>(e); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(e).clear(); + my_files.emplace_back(std::get<0>(e)); + } + m_open_fd_pq.clear(); + + size_t num_samples = this->all_gather_field(m_sample_list, per_rank_samples, comm); + size_t num_ids = this->all_gather_field(my_files, per_rank_files, comm); + size_t num_files = 
this->all_gather_field(m_file_map, per_rank_file_map, comm); + + m_sample_list.clear(); + m_file_id_stats_map.clear(); + + m_sample_list.reserve(num_samples); + m_file_id_stats_map.reserve(num_ids); + m_file_map.reserve(num_files); + + std::unordered_map mp; + for(int r = 0; r < num_ranks; r++) { + const samples_t& s_list = per_rank_samples[r]; + const auto& files = per_rank_files[r]; + const std::unordered_map& file_map = per_rank_file_map[r]; + for (const auto& s : s_list) { + sample_file_id_t index = s.first; + const std::string& filename = files[index]; + if(index >= m_file_id_stats_map.size() + || (std::get<0>(m_file_id_stats_map.back()) != filename)) { + index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(std::make_tuple(filename, uninitialized_file_handle(), std::deque>{})); + // Update the file map structure + if(m_file_map.count(filename) == 0) { + m_file_map[filename] = file_map.at(filename); + } + mp[filename] = index; + }else { + auto search_result = mp.find(filename); + if (search_result == mp.end()) { + LBANN_ERROR("mp.find(filename) == mp.end()"); + } + index = search_result->second; + } + m_sample_list.emplace_back(std::make_pair(index, s.second)); + } + } + + return; +} + +template +inline void sample_list_open_files +::compute_epochs_file_usage(const std::vector& shuffled_indices, + int mini_batch_size, + const lbann_comm& comm) { + for (auto&& e : m_file_id_stats_map) { + auto& h = std::get<1>(e); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(e).clear(); + } + // Once all of the file handles are closed, clear the priority queue + m_open_fd_pq.clear(); + for (size_t i = 0; i < shuffled_indices.size(); i++) { + int idx = shuffled_indices[i]; + const auto& s = m_sample_list[idx]; + sample_file_id_t index = s.first; + + if((i % mini_batch_size) % comm.get_procs_per_trainer() == static_cast(comm.get_rank_in_trainer())) { + /// Enqueue the iteration step when the sample will get used + int step = i / mini_batch_size; + int substep = (i % mini_batch_size) / comm.get_procs_per_trainer(); + std::get<2>(m_file_id_stats_map[index]).emplace_back(std::make_pair(step, substep)); + } + } +} + +template +inline void sample_list_open_files +::delete_file_handle_pq_entry(sample_file_id_t id) { + for (std::deque::iterator it = m_open_fd_pq.begin(); it!=m_open_fd_pq.end(); ++it) { + if(it->first == id) { + it = m_open_fd_pq.erase(it); + break; + } + } + return; +} + +template +inline void sample_list_open_files +::manage_open_file_handles(sample_file_id_t id, bool pre_open_fd) { + /// When we enter this function the priority queue is either empty or a heap + if(!m_open_fd_pq.empty()) { + if(m_open_fd_pq.size() > m_max_open_files) { + auto& f = m_open_fd_pq.front(); + auto& victim = m_file_id_stats_map[f.first]; + auto& victim_fd = std::get<1>(victim); + std::pop_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); + m_open_fd_pq.pop_back(); + close_file_handle(victim_fd); + clear_file_handle(victim_fd); + } + } + + /// Before we can enqueue the any new access times for this descriptor, remove any + /// earlier descriptor + std::sort_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); + if(m_open_fd_pq.front().first == id) { + m_open_fd_pq.pop_front(); + } + std::make_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); + + auto& e = m_file_id_stats_map[id]; + auto& file_access_queue = std::get<2>(e); + if(!file_access_queue.empty()) { + if(!pre_open_fd) { + file_access_queue.pop_front(); + } + } + if(!file_access_queue.empty()) { + 
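+ // Added descriptive note: the handle is re-enqueued keyed by its next
+ // scheduled (step, substep) access; because pq_cmp is a less-than ordering,
+ // the heap keeps the descriptor whose next use is furthest in the future
+ // (or the INT_MAX terminator below, meaning no future use) at the front,
+ // which is the entry evicted first once m_max_open_files is exceeded.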
m_open_fd_pq.emplace_back(std::make_pair(id,file_access_queue.front())); + }else { + /// If there are no future access of the file place a terminator entry to track + /// the open file, but is always sorted to the top of the heap + m_open_fd_pq.emplace_back(std::make_pair(id,std::make_pair(INT_MAX,id))); + } + std::push_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); + return; +} + +template +inline file_handle_t sample_list_open_files +::open_samples_file_handle(const size_t i, bool pre_open_fd) { + const sample_t& s = m_sample_list[i]; + sample_file_id_t id = s.first; + file_handle_t h = get_samples_file_handle(id); + if (!is_file_handle_valid(h)) { + const std::string& file_name = get_samples_filename(id); + const std::string& file_dir = this->get_samples_dirname(); + const std::string file_path = add_delimiter(file_dir) + file_name; + if (file_name.empty() || !check_if_file_exists(file_path)) { + LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' does not exist."); + } + + h = open_file_handle(file_path); + + if (!is_file_handle_valid(h)) { + LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' could not be opened."); + } + auto& e = m_file_id_stats_map[id]; + std::get<1>(e) = h; + /// If a new file is opened, place it in the priority queue + manage_open_file_handles(id, pre_open_fd); + } + return h; +} + +template +inline void sample_list_open_files +::close_if_done_samples_file_handle(const size_t i) { + const sample_t& s = m_sample_list[i]; + sample_file_id_t id = s.first; + auto h = get_samples_file_handle(id); + if (!is_file_handle_valid(h)) { + auto& e = m_file_id_stats_map[id]; + auto& file_access_queue = std::get<2>(e); + if(file_access_queue.empty()) { + auto& fh = std::get<1>(e); + close_file_handle(fh); + clear_file_handle(fh); + delete_file_handle_pq_entry(id); + } + } +} + +template +inline bool sample_list_open_files +::is_file_handle_valid(const file_handle_t& h) const { + LBANN_ERROR(std::string{} + " :: abstract class does not implement this method"); + return false; +} + +template +inline file_handle_t sample_list_open_files +::open_file_handle_for_read(const std::string& file_path) { + LBANN_ERROR(std::string{} + " :: abstract class does not implement this method"); + return file_handle_t(); +} + +template +inline void sample_list_open_files +::close_file_handle(file_handle_t& h) { + LBANN_ERROR(std::string{} + " :: abstract class does not implement this method"); +} + +template +inline void sample_list_open_files +::clear_file_handle(file_handle_t& h) { + LBANN_ERROR(std::string{} + " :: abstract class does not implement this method"); +} + +} // end of namespace lbann diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 91995485d83..df0f1ced1d4 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -30,13 +30,13 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "lbann/base.hpp" #include "lbann/comm.hpp" +#include "lbann/utils/exception.hpp" #include "conduit/conduit_node.hpp" #include #include +#include namespace lbann { @@ -52,6 +52,13 @@ class data_store_conduit { public: + // need to quickly change from unordered_map to map for debugging + using map_ii_t = std::unordered_map; + using map_is_t = std::unordered_map; + + // not currently used; will be in the future + using map_ss_t = std::unordered_map; + //! 
ctor data_store_conduit(generic_data_reader *reader); @@ -69,36 +76,32 @@ class data_store_conduit { //! dtor ~data_store_conduit(); - /// normally not needed, since reader is passed to ctor. But may - /// be useful in some cases - void set_data_reader_ptr(generic_data_reader *reader) { m_reader = reader; } + void set_data_reader_ptr(generic_data_reader *reader); //! convenience handle - void set_shuffled_indices(const std::vector *indices) { m_shuffled_indices = indices; } - - void setup(int mini_batch_size); + void set_shuffled_indices(const std::vector *indices); - /* - * dah - may be needed in the future, but not needed for bare-bones squashing - void set_is_subsidiary_store() { - m_is_subsidiary_store = true; - } + /** @brief Returns the number of samples summed over all ranks */ + size_t get_num_global_indices() const; - bool is_subsidiary_store() const { - return m_is_subsidiary_store; - } - */ + void setup(int mini_batch_size); + // TODO FIXME void check_mem_capacity(lbann_comm *comm, const std::string sample_list_file, size_t stride, size_t offset); - /// returns the conduit node + /** @brief Returns the conduit Node associated with the data_id */ const conduit::Node & get_conduit_node(int data_id) const; - /// if 'already_have = true' then the passed 'node' was obtained by a call to - /// get_empty_node(). In some operating modes this saves us from copying the node - void set_conduit_node(int data_id, conduit::Node &node, bool already_have = false); + /** @brief Set a conduit node in the data store + * + * if 'already_have = true' then the passed 'node' was obtained by a call to + * get_empty_node(); note, we do this to prevent copying the node + */ + void set_conduit_node(int data_id, const conduit::Node &node, bool already_have = false); - void set_preloaded_conduit_node(int data_id, conduit::Node &node); + void set_preloaded_conduit_node(int data_id, const conduit::Node &node); + + void spill_preloaded_conduit_node(int data_id, const conduit::Node &node); const conduit::Node & get_random_node() const; @@ -107,21 +110,92 @@ class data_store_conduit { /// returns an empty node conduit::Node & get_empty_node(int data_id); - /// As of this writing, will be called if cmd line includes: --preload_data_store - /// This may change in the future; TODO revisit - void set_preload() { m_preload = true; } - - bool is_preloaded() { return m_preload; } - - void set_explicit_loading(bool flag) { m_explicit_loading = flag; } + //================================================================= + // methods for setting and querying the data store's mode + //================================================================= + /** @brief Returns true if preloading is turned on + * + * See notes in: is_explicitly_loading() + */ + bool is_preloading() const { return m_preloading; } + + /** @brief Returns true if explicitly loading is turned on + * + * 'explicitly loading' means that the data that will be owned + * by each rank is passed into the data store during the first epoch. + * This is in contrast to preloading, in which the data is passed into + * the data store prior to the first epoch. Explicit and preloading + * are exclusive: at most only one may be true, however, both will + * be set to false when all loading is complete. 
+ */
+ bool is_explicitly_loading() const { return m_explicitly_loading; }
+
+ /** @brief Returns true if all loading has been completed
+ *
+ * See notes in: set_loading_is_complete()
+ */
+ bool is_fully_loaded() const;
+
+ /** @brief Returns "true" if running in local cache mode
+ *
+ * In local cache mode, each node contains a complete copy
+ * of the data set. This is stored in a shared memory segment,
+ * but part of the set may be spilled to disk if memory is
+ * insufficient. Local cache mode is activated via the cmd line
+ * flag: --data_store_cache
+ */
+ bool is_local_cache() const { return m_is_local_cache; }
- bool is_explicitly_loading() { return m_explicit_loading; }
+ /** @brief Turn preloading on or off */
+ void set_is_preloading(bool flag);
+
+ /** @brief Turn on explicit loading */
+ void set_is_explicitly_loading(bool flag);
+
+ /** @brief Marks the data_store as fully loaded
+ *
+ * Fully loaded means that each rank has all the data that it
+ * is intended to own. When not running in local cache mode, this
+ * occurs (1) at the conclusion of preloading, prior to the beginning of
+ * the first epoch, or (2) at the conclusion of the first epoch, if
+ * explicitly loading. When running in local cache mode, this occurs
+ * (1) at the conclusion of preload_local_cache(), which is called prior
+ * to the first epoch, or (2) at the conclusion of exchange_local_caches(),
+ * at the conclusion of the first epoch, if explicitly loading.
+ */
+ void set_loading_is_complete();
+
+
+ /** @brief Turns local cache mode on or off */
+ void set_is_local_cache(bool flag = true) { m_is_local_cache = flag; }
+
+ /** @brief Check that explicit loading, preloading, and fully loaded flags are consistent */
+ void check_query_flags() const;
+
+ //=================================================================
+ // END methods for setting and querying the data store's mode
+ //=================================================================
+
+//XX void { m_owner_maps_were_exchanged = false; }
+ /// fills in m_owner, which maps index -> owning processor
+ void exchange_owner_maps(); /// fills in m_owner, which maps index -> owning processor
+ void build_preloaded_owner_map(const std::vector& per_rank_list_sizes);
- /// Removed nodes corresponding from the indices vector from the data store
- void purge_unused_samples(const std::vector& indices);
+ /// fills in m_owner, which maps index -> owning processor
+ void set_preloaded_owner_map(const std::unordered_map &owner) { m_owner = owner; }
+
+ /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */
+ void clear_owner_map();
+
+ void set_owner_map(const std::unordered_map &m) { m_owner = m; }
+
+ /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */
+ void add_owner(int data_id, int owner) { m_owner[data_id] = owner; }
+
+ /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */
+ void set_finished_building_map() { m_owner_maps_were_exchanged = true; } /// Recompact the nodes because they are not copied properly when instantiating /// using the copy constructor @@ -131,77 +205,239 @@ class data_store_conduit { /// with the index int get_index_owner(int idx);
- bool is_local_cache() const { return m_is_local_cache; }
- void exchange_mini_batch_data(size_t current_pos, size_t mb_size) {
- if (is_local_cache()) {
- return;
- }
- if (m_super_node) {
- exchange_data_by_super_node(current_pos, mb_size);
- } else { -
exchange_data_by_sample(current_pos, mb_size); - } - ++m_n; - } + /** @brief Read the data set into memory + * + * Each rank reads a portion of the data set, then + * bcasts to all other ranks. + */ + void preload_local_cache(); + + void exchange_mini_batch_data(size_t current_pos, size_t mb_size); + + void set_node_sizes_vary() { m_node_sizes_vary = true; } bool has_conduit_node(int data_id) const; -protected : + /// only used for debugging; pass --debug on cmd line to get + /// each data store to print to a different file. This is made + /// public so data readers can also print to the file + std::ofstream *m_debug = nullptr; + std::ofstream *m_profile = nullptr; - /// records the number of times exchange_mini_batch_data has been called - int m_n; + /// for use during development and debugging + int get_data_size() { return m_data.size(); } - bool m_is_setup; + /// made public for debugging during development + void copy_members(const data_store_conduit& rhs); - void copy_members(const data_store_conduit& rhs, const std::vector& = std::vector()); - generic_data_reader *m_reader; + /** @brief Closes then reopens the debug logging file + * + * Debug logging is enabled on all ranks via the cmd line flag: --data_store_debug + */ + void flush_debug_file(); - lbann_comm *m_comm; + /** @brief Closes then reopens the profile logging file + * + * Profile logging is enabled on P_0 via the cmd line flag: --data_store_profile + */ + void flush_profile_file() const; - /// rank in the trainer; convenience handle - int m_rank_in_trainer; + /** @brief Writes object's state to file */ + void write_checkpoint(std::string dir_name); + + /** @brief Loads object's state from file */ + void load_checkpoint(std::string dir_name, generic_data_reader *reader = nullptr); - /// number of procs in the trainer; convenience handle - int m_np_in_trainer; + /** @brief Add text to the profiling file, if it's opened */ + void set_profile_msg(std::string); - /// convenience handle - bool m_world_master; + /** @brief Runs an internal test to ensure the locally cached conduit data is correct + * + * For use during development and testing. This test is activated via + * the cmd line flag: --data_store_test_cache. 
Output may be written to + * cout, and the profile and debug files (if they are opened) + * @param n is the maximum number of samples to test; set to -1 to test all + * @return true, if all samples read from file match those constructed from + * the local shared memory segment (aka, cache) + */ + bool test_local_cache_imagenet(int n); - /// convenience handle - bool m_trainer_master; + void test_imagenet_node(int sample_id, bool dereference = true); - /// set to true if data_store is preloaded - bool m_preload; + size_t get_mem_usage(); - /// set to true if data_store is being explicitly loaded - bool m_explicit_loading; +private : - /// maps an index to the processor that owns the associated data - mutable std::unordered_map m_owner; + bool m_bcast_sample_size = true; - /// convenience handle - const std::vector *m_shuffled_indices; + // if not null, 'm_other' points from a train to a validation + // data store; this permits communication which is needed in + // special cases (e.g, see: data_reader_npz_ras_lipid.cpp) + data_store_conduit *m_other = nullptr; + + bool m_owner_maps_were_exchanged = false; + + bool m_run_checkpoint_test = false; + + /** @brief The number of samples that this processor owns */ + size_t m_my_num_indices = 0; + + /** @brief if true, then we are spilling (offloading) samples to disk */ + bool m_spill = false; + + /** @brief if true, then all samples have been spilled */ + bool m_is_spilled = false; + + /** During spilling, the conduit file pathnames are written to this file */ + std::ofstream m_metadata; + + /** @brief Base directory for spilling (offloading) conduit nodes */ + std::string m_spill_dir_base; + + /** @brief Used to form the directory path for spilling conduit nodes */ + int m_cur_spill_dir_integer = -1; + + /** @brief @brief Current directory for spilling (writing to file) conduit nodes + * + * m_cur_spill_dir = m_spill_dir_base/ + */ + std::string m_cur_spill_dir; + + /** @brief The directory to use for testing checkpointing + * + * Testing is activated by passing the cmd flag: --data_store_test_checkpoint= + */ + std::string m_test_dir; + + /** @brief Contains the number of conduit nodes that have been written to m_cur_dir + * + * When m_num_files_in_cur_spill_dir == m_max_files_per_directory, + * m_cur_spill_dir_integer is incremented and a new m_cur_dir is created + */ + int m_num_files_in_cur_spill_dir; + + /** @brief maps data_id to m_m_cur_spill_dir_integer. */ + map_ii_t m_spilled_nodes; + + /// used in set_conduit_node(...) 
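+ /// Added note: presumably guards concurrent insertions into m_data when
+ /// several I/O threads deliver samples at once; a minimal sketch of the
+ /// assumed pattern inside set_conduit_node() (illustrative only -- the real
+ /// locking lives in the .cpp file):
+ ///   std::lock_guard<std::mutex> guard(m_mutex);
+ ///   m_data[data_id] = node;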
+ std::mutex m_mutex; + std::mutex m_mutex_2; + + /// for use in local cache mode + char *m_mem_seg = 0; + size_t m_mem_seg_length = 0; + std::string m_seg_name; + + const std::string m_debug_filename_base = "debug"; + std::string m_debug_filename; + + const std::string m_profile_filename_base = "data_store_profile"; + std::string m_profile_filename; + + bool m_was_loaded_from_file = false; + const std::string m_cereal_fn = "data_store_cereal"; + + /// used in spill_to_file + /// (actually, conduit::Node.save() writes both a + /// json file and a binary file, so double this number + const int m_max_files_per_directory = 500; + + //=========================================================== + // timers for profiling exchange_data + //=========================================================== + + // applicable to imagenet; NA for JAG + double m_exchange_sample_sizes_time = 0; + + // time from beginning of exchange_data_by_sample to wait_all + double m_start_snd_rcv_time = 0; + + // time for wait_all + double m_wait_all_time = 0; + + // time to unpack nodes received from other ranks + double m_rebuild_time = 0; + + // total time for exchange_mini_batch_data + double m_exchange_time = 0; + + // sanity check: + // m_start_snd_rcv_time + m_wait_all_time + m_rebuild_time + // should be only slightly less than m_exchange_time; + // Note that, for imagenet, the first call to exchange_data_by_sample + // involves additional communication for exchanging sample sizes + + //=========================================================== + // END: timers for profiling exchange_data + //=========================================================== + + bool m_is_setup = false; + + /// set to true if data_store is preloaded + bool m_loading_is_complete = false; + + /** @brief True, if we are in preload mode */ + bool m_preloading = false; + + /** @brief True, if we are in explicit loading mode + * + * There is some redundancy here: m_preloading and m_explicitly_loading + * can not both be true, but both may be false. When m_loading_is_complete + * is true, both m_preloading and m_preloading should be false. + */ + bool m_explicitly_loading = false; /// The size of the mini-batch that was used to calculate ownership /// of samples when building the owner map. This size has to be /// used consistently when computing the indices that will be sent /// and received. - int m_owner_map_mb_size; + int m_owner_map_mb_size = 0; - /// if true, use exchange_data_by_super_node, else use - /// exchange_data_by_sample; default if false - bool m_super_node; + /// size of a compacted conduit::Node that contains a single sample + int m_compacted_sample_size = 0; - void exchange_data_by_super_node(size_t current_pos, size_t mb_size); - void exchange_data_by_sample(size_t current_pos, size_t mb_size); + bool m_is_local_cache = false; + + bool m_node_sizes_vary = false; + + /// used in exchange_data_by_sample, when sample sizes are non-uniform + bool m_have_sample_sizes = false; + + generic_data_reader *m_reader; + + lbann_comm *m_comm = nullptr; + + /// convenience handles + bool m_world_master; + bool m_trainer_master; + int m_rank_in_trainer; + int m_rank_in_world = -1; // -1 for debugging + int m_np_in_trainer; + + /** @brief Maps an index to the processor that owns the associated data */ + map_ii_t m_owner; + + /// convenience handle + const std::vector *m_shuffled_indices; + + /** @brief Contains the conduit nodes that are "owned" by this rank + * + * Maps data_id -> conduit::Node. 
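+ *
+ * Added note (inferred from the surrounding declarations): during
+ * exchange_data_by_sample() the owning rank looks a requested sample up in
+ * this map, compacts it with build_node_for_sending(), and ships it to the
+ * ranks recorded in m_indices_to_send; samples arriving from other ranks are
+ * staged separately for the current mini-batch rather than inserted here.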
+ */ + std::unordered_map m_data; + + /** @brief Contains the conduit nodes that are "owned" by this rank + * + * This differs from m_data in that this holds temporarily, + * during the first epoch, if we're running in local cache mode + * and explicitly loading + */ + std::unordered_map m_data_cache; /// Contains the list of data IDs that will be received std::vector m_recv_data_ids; - - /// contains the Nodes that this processor owns; - /// maps data_id to conduit::Node - mutable std::unordered_map m_data; + map_ii_t m_recv_sample_sizes; /// This vector contains Nodes that this processor needs for /// the current minibatch; this is filled in by exchange_data() @@ -213,47 +449,189 @@ protected : std::vector> m_send_requests; std::vector> m_recv_requests; std::vector m_recv_buffer; - std::vector m_outgoing_msg_sizes; - std::vector m_incoming_msg_sizes; + std::vector m_outgoing_msg_sizes; + std::vector m_incoming_msg_sizes; - /// size of a compacted conduit::Node that contains a single sample - int m_compacted_sample_size; + /** @brief Maps a data_id to its image size + * + * Used when conduit Nodes have non-uniform size, e.g, imagenet; + * see: set_node_sizes_vary() + */ + map_is_t m_sample_sizes; + + /** @brief Maps a data_id to the image location in a shared memory segment */ + map_is_t m_image_offsets; + + /// maps processor id -> set of indices (whose associated samples) + /// this proc needs to send. (formerly called "proc_to_indices); + /// this is filled in by build_indices_i_will_send() + std::vector> m_indices_to_send; + + /// maps processor id -> set of indices (whose associated samples) + /// this proc needs to recv from others. (formerly called "needed") + std::vector> m_indices_to_recv; - /// used in exchange_data_by_super_node(); contains the super_nodes, - /// after they have been converted from compacted format - std::vector m_reconstituted; + //========================================================================= + // methods follow + //========================================================================= + + void exchange_data_by_sample(size_t current_pos, size_t mb_size); void setup_data_store_buffers(); /// called by exchange_data - static void build_node_for_sending(const conduit::Node &node_in, conduit::Node &node_out); + void build_node_for_sending(const conduit::Node &node_in, conduit::Node &node_out); - /// fills in m_owner, which maps index -> owning processor - void build_owner_map(int mini_batch_size); - - /// maps processor id -> set of indices (whose associated samples) - /// this proc needs to send. (formerly called "proc_to_indices) - std::vector> m_indices_to_send; + /// for use when conduit Nodes have non-uniform size, e.g, imagenet + void exchange_sample_sizes(); /// fills in m_indices_to_send and returns the number of samples /// that will be sent int build_indices_i_will_send(int current_pos, int mb_size); - /// maps processor id -> set of indices (whose associated samples) - /// this proc needs to recv from others. (formerly called "needed") - std::vector> m_indices_to_recv; - /// fills in m_indices_to_recv and returns the number of samples /// that will be received int build_indices_i_will_recv(int current_pos, int mb_size); void error_check_compacted_node(const conduit::Node &nd, int data_id); - bool m_is_local_cache; + /** @brief All ranks exchange their cached data */ + void exchange_local_caches(); + + /// Currently only used for imagenet. 
On return, 'sizes' maps a sample_id to image size, and indices[p] contains the sample_ids that P_p owns + /// for use in local cache mode + void get_image_sizes(map_is_t &sizes, std::vector> &indices); + + /// for use in local cache mode + void allocate_shared_segment(map_is_t &sizes, std::vector> &indices); + + /// for use in local cache mode + void read_files(std::vector &work, map_is_t &sizes, std::vector &indices); + + /// fills in m_image_offsets for use in local cache mode + void compute_image_offsets(map_is_t &image_sizes, std::vector> &indices); + + /// for use in local cache mode + void exchange_images(std::vector &work, map_is_t &image_sizes, std::vector> &indices); + + /// for use in local cache mode + void build_conduit_nodes(map_is_t &sizes); + + + /// for use in local cache mode + void fillin_shared_images(char* images, size_t size, size_t offset); + + /** @brief For testing during development + * + * At the beginning of the 2nd epoch, calls write_checkpoint(), + * clears some variables, calls load_checkpoint then continues. + * To activate this test use cmd flag: --data_store_test_checkpoint= + */ + void test_checkpoint(const std::string&); + + /** @brief Called by test_checkpoint */ + void print_variables(); + + /** @brief Called by test_checkpoint + * + * For testing and development. Prints the first 'n' entries from + * the owner map * (which maps sample_id -> owning rank) to std::cout + */ + void print_partial_owner_map(int n); + + std::string get_conduit_dir() const; + std::string get_cereal_fn() const; + std::string get_metadata_fn() const; + + /** @brief Creates the directory if it does not already exist */ + void make_dir_if_it_doesnt_exist(const std::string &dir); + + /** @brief Writes conduit node to file */ + void spill_conduit_node(const conduit::Node &node, int data_id); + + /** @brief Loads conduit nodes from file into m_data */ + void load_spilled_conduit_nodes(); + + /** @brief Creates directory structure, opens metadata file for output, etc + * + * This method is called for both --data_store_spill and + * --data_store_test_checkpoint + */ + void setup_spill(std::string dir); + + /** @brief Saves this object's state to file + * + * Here, "state" is all data, except for conduit nodes, that is + * needed to reload from checkpoint + */ + void save_state(); + + /** @brief Optionally open debug and profiling files + * + * A debug file is opened for every pair; + * files are opened if the cmd flag --data_store_debug is passed. + * A profiling file is opened only be + * pairs; files are opened if the cmd flag --data_store_profile is passed. + */ + void open_informational_files(); + + /** @brief Creates a directory for spilling conduit nodes */ + void open_next_conduit_spill_directory(); + + /** @brief Write timing data for data exchange to the profile file, if it's opened */ + void profile_timing(); + + void setup_checkpoint_test(); + + std::string get_lassen_spill_dir(); + + void verify_sample_size(); + + //========================================================================= + // functions and templates for optional profiling and debug files follow + //========================================================================= + + void PROFILE() const { + if (!m_profile) { + return; + } + (*m_profile) << std::endl; + flush_profile_file(); + } + + template + void PROFILE(T var1, Types... var2) const { + if (!m_world_master) { + return; + } + if (!m_profile) { + return; + } + (*m_profile) << var1 << " "; + PROFILE(var2...) 
; + flush_profile_file(); + } + + void DEBUG_DS() { + if (!m_debug) { + return; + } + (*m_debug) << std::endl; + flush_debug_file(); + } + + template + void DEBUG_DS(T var1, Types... var2) { + if (!m_debug) { + return; + } + (*m_debug) << var1 << " "; + DEBUG_DS(var2...) ; + flush_debug_file(); + } }; } // namespace lbann -#endif //#ifdef LBANN_HAS_CONDUIT #endif // __DATA_STORE_JAG_HPP__ diff --git a/include/lbann/execution_contexts/CMakeLists.txt b/include/lbann/execution_contexts/CMakeLists.txt new file mode 100644 index 00000000000..79bd7243399 --- /dev/null +++ b/include/lbann/execution_contexts/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + execution_context.hpp + sgd_execution_context.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/execution_contexts/execution_context.hpp b/include/lbann/execution_contexts/execution_context.hpp new file mode 100644 index 00000000000..f26ea0d21d4 --- /dev/null +++ b/include/lbann/execution_contexts/execution_context.hpp @@ -0,0 +1,177 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_EXECUTION_CONTEXT_HPP +#define LBANN_EXECUTION_CONTEXT_HPP + +#include "lbann/base.hpp" +#include "lbann/comm.hpp" +#include "lbann/io/persist.hpp" +#include "lbann/utils/threads/thread_pool.hpp" +#include + +namespace lbann { + +// Forward-declare this. +class trainer; +class training_algorithm; + +class termination_criteria { +public: + size_t num_steps; +}; + +class execution_context { +public: + /** Constructor. */ + execution_context(trainer& trainer, training_algorithm& training_alg, + lbann_comm *comm, execution_mode mode); + /** Destructor. */ + virtual ~execution_context() = default; + + /** Copy execution_context. 
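+ *
+ * Added note: this is the usual polymorphic clone idiom, so code holding only
+ * a base-class reference can duplicate a context without knowing its dynamic
+ * type, e.g. (illustrative): auto ctx_copy = ctx.copy_execution_context();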
*/ + virtual std::unique_ptr copy_execution_context() const { + // Use explicit construction of unique pointer since copy + // constructor is protected and cannot be accessed in make_unique + return std::unique_ptr{new execution_context(*this)}; + } + + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_execution_mode), + CEREAL_NVP(m_terminate_training), + CEREAL_NVP(m_step)); + } + + /** @brief Return the state of the execution context as a string */ + virtual std::string get_state_string() const noexcept { + return build_string("ec.", to_string(get_execution_mode()), + ".step.", get_step()); + } + + /** @brief Current step in the training algorithm + * @details Step counts the number of iterations in the training + * algorithm's internal state + */ + size_t get_step() const noexcept { return m_step; } + + /** @brief Increment the current step in the training algorithm + * @details Increment the step count in the training + * algorithm's internal state + */ + void inc_step() noexcept { ++m_step; } + + /** Get the mode that the trainer is currenting executing. */ + inline void set_execution_mode(execution_mode mode) noexcept { + m_execution_mode = mode; + } + + /** Get the mode that the trainer is currenting executing. */ + inline execution_mode get_execution_mode() const noexcept { + return m_execution_mode; + } + + /** Return true if the flag to stop training is set. */ + bool get_terminate_training() const { + return m_terminate_training; + } + /** Set the terminate training flag (on or off). */ + void set_terminate_training(bool f) { + m_terminate_training = f; + } + + /** Grab the trainer from the execution context */ + const trainer& get_trainer() const { + return m_trainer; + } + + trainer& get_trainer() { + return const_cast(static_cast(*this).get_trainer()); + } + + const training_algorithm& get_training_algorithm() const { + return m_training_algorithm; + } + + training_algorithm& get_training_algorithm() { + return const_cast(static_cast(*this).get_training_algorithm()); + } + + thread_pool& get_io_thread_pool() const; + + lbann_comm& get_comm() const { + if (!m_comm) { LBANN_ERROR("m_comm is null"); } + return *m_comm; + }; + + /** Are background I/O activities enabled by the input layers */ + bool background_io_activity_allowed(); + + /** Checkpoint training_algorithm to given file descriptor */ + virtual void save_to_checkpoint_shared(persist& p); + /** Restore training_algorithm by reading checkpoint from given file descriptor */ + virtual void load_from_checkpoint_shared(persist& p); + virtual void save_to_checkpoint_distributed(persist& p); + virtual void load_from_checkpoint_distributed(persist& p); + +protected: + /** Copy constructor. */ + execution_context(const execution_context& other) = default; + /** Copy assignment operator. */ + execution_context& operator=(const execution_context& other) = default; + /** Move constructor. */ + execution_context(execution_context&& other) = default; + /** Move assignment operator. */ + execution_context& operator=(execution_context&& other) = default; + +private: + /** Pointer to the training context (execution environment) for the training algorithm */ + trainer& m_trainer; + + training_algorithm& m_training_algorithm; + + /** LBANN communicator. */ + observer_ptr m_comm; + + /** The trainer's current execution mode. 
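+ *
+ * Added note: defaults to execution_mode::training (see the initializer just
+ * below) and is read and written through get_execution_mode() /
+ * set_execution_mode() above rather than being touched directly.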
*/ + execution_mode m_execution_mode = execution_mode::training; + + /** @brief Current step in the training algorithm + * @details Step counts the number of iterations in the training + * algorithm's internal state + */ + size_t m_step = 0; + + /** @brief Whether to terminate training. + * @details If true, training will terminate immediately before + * the next epoch. + */ + bool m_terminate_training = false; +}; + +} // namespace lbann + +#endif // LBANN_EXECUTION_CONTEXT_HPP diff --git a/include/lbann/execution_contexts/sgd_execution_context.hpp b/include/lbann/execution_contexts/sgd_execution_context.hpp new file mode 100644 index 00000000000..4d81ae68fbf --- /dev/null +++ b/include/lbann/execution_contexts/sgd_execution_context.hpp @@ -0,0 +1,127 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_SGD_EXECUTION_CONTEXT_HPP +#define LBANN_SGD_EXECUTION_CONTEXT_HPP + +#include "lbann/execution_contexts/execution_context.hpp" +#include +namespace lbann { + +class sgd_termination_criteria : public termination_criteria { +public: + size_t num_epochs; +}; + + +/** @brief SGD Uses the step to track the Current mini-batch step for + * execution mode. + * @details Step counts are not reset after each epoch. + */ +class sgd_execution_context final : public execution_context { +public: + /** Constructor. */ + sgd_execution_context(trainer& trainer, training_algorithm& training_alg, + lbann_comm *comm, execution_mode mode, size_t mini_batch_size); + /** Destructor. */ + virtual ~sgd_execution_context() = default; + + /** Copy constructor. */ + sgd_execution_context(const sgd_execution_context& other) = default; + /** Copy assignment operator. */ + sgd_execution_context& operator=(const sgd_execution_context& other) = default; + /** Move constructor. */ + sgd_execution_context(sgd_execution_context&& other) = default; + /** Move assignment operator. */ + sgd_execution_context& operator=(sgd_execution_context&& other) = default; + /** Copy sgd_execution_context. 
*/ + virtual std::unique_ptr copy_execution_context() const { return make_unique(*this); } + + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(cereal::base_class( this ), + CEREAL_NVP(m_epoch), + CEREAL_NVP(m_current_mini_batch_size), + CEREAL_NVP(m_effective_mini_batch_size)); + } + + /** @brief Return the state of the execution context as a string */ + std::string get_state_string() const noexcept override { + return build_string("sgd.", to_string(get_execution_mode()), + ".epoch.", get_epoch(), ".step.", get_step()); + } + + /** Number of times the training set has been traversed. */ + inline size_t get_epoch() const noexcept { return m_epoch; } + + /** @brief Increment the current epoch in the execution context + * @details Increment the counter tracking the number of times + * that the data set has been traversed. + */ + void inc_epoch() noexcept { ++m_epoch; } + + /** Set the trainer's current mini-batch size. */ + inline void set_current_mini_batch_size(size_t mini_batch_size) { + m_current_mini_batch_size = mini_batch_size; + } + /** Get the trainer's current mini-batch size. */ + inline size_t get_current_mini_batch_size() const { + return m_current_mini_batch_size; + } + /** Get the trainer's effective mini-batch size. */ + inline size_t get_effective_mini_batch_size() const { + return m_effective_mini_batch_size; + } + /** Set the trainer's effective mini-batch size. */ + inline void set_effective_mini_batch_size(size_t mini_batch_size) { + m_effective_mini_batch_size = mini_batch_size; + } + + /** Checkpoint training_algorithm to given file descriptor */ + virtual void save_to_checkpoint_shared(persist& p); + /** Restore training_algorithm by reading checkpoint from given file descriptor */ + virtual void load_from_checkpoint_shared(persist& p); + virtual void save_to_checkpoint_distributed(persist& p); + virtual void load_from_checkpoint_distributed(persist& p); + +private: + /** Number of times the training data set has been traversed. */ + size_t m_epoch = 0; + + /** Size of the current mini-batch in the model. */ + size_t m_current_mini_batch_size; + + /** The "effective" size of a minibatch. + * + * This is the size of the minibatch across all models and used for + * e.g. correctly averaging gradients from multiple models. 
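+ *
+ * Worked example (illustrative numbers, not taken from the source): with 4
+ * models each drawing a local mini-batch of 64 samples, the current
+ * mini-batch size is 64 while the effective mini-batch size is 4 * 64 = 256,
+ * so gradients summed across all models are averaged with a factor of 1/256
+ * rather than 1/64.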
+ */ + size_t m_effective_mini_batch_size; +}; + +} // namespace lbann + +#endif // LBANN_SGD_EXECUTION_CONTEXT_HPP diff --git a/include/lbann/io/data_buffers/generic_io_buffer.hpp b/include/lbann/io/data_buffers/generic_io_buffer.hpp index 1f0ebc807de..a8d4f7ecec0 100644 --- a/include/lbann/io/data_buffers/generic_io_buffer.hpp +++ b/include/lbann/io/data_buffers/generic_io_buffer.hpp @@ -36,11 +36,12 @@ namespace lbann { +template class fetch_data_functor { public: fetch_data_functor (data_reader_target_mode target_mode) : _target_mode(target_mode) {} - int operator() (CPUMat& samples, CPUMat& responses, El::Matrix& indices_fetched, generic_data_reader* data_reader) const { + int operator() (CPUMatDT& samples, CPUMatDT& responses, El::Matrix& indices_fetched, generic_data_reader* data_reader) const { int num_samples_fetched = data_reader->fetch_data(samples, indices_fetched); int num_responses_fetched; switch(_target_mode) { @@ -64,7 +65,7 @@ class fetch_data_functor { } return num_samples_fetched; } - int operator() (CPUMat& samples, El::Matrix& indices_fetched, generic_data_reader* data_reader) const { + int operator() (CPUMatDT& samples, El::Matrix& indices_fetched, generic_data_reader* data_reader) const { int num_samples_fetched = data_reader->fetch_data(samples, indices_fetched); switch(_target_mode) { case data_reader_target_mode::NA: @@ -89,9 +90,22 @@ class update_data_reader_functor { } }; +template class generic_io_buffer { public: - generic_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers); + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The local tensor type expected for IO in this object. */ + using IODataType = DataType; + + ///@} + +public: + generic_io_buffer(lbann_comm *comm, int num_parallel_readers); generic_io_buffer( const generic_io_buffer&); generic_io_buffer& operator=( @@ -112,8 +126,8 @@ class generic_io_buffer { virtual void setup_data(El::Int num_neurons, El::Int num_targets, El::Int max_minibatch_size) = 0; virtual int fetch_to_local_matrix(generic_data_reader *data_reader, execution_mode mode) = 0; - virtual void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMat& sample, AbsDistMat& response) {} - virtual void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMat& sample) {} + virtual void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample, AbsDistMatrixType& response) {} + virtual void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample) {} virtual bool update_data_set(generic_data_reader *data_reader, execution_mode mode) = 0; virtual void set_fetch_data_in_background(bool flag, execution_mode mode) = 0; virtual bool is_data_fetched_in_background(execution_mode mode) = 0; @@ -122,17 +136,27 @@ class generic_io_buffer { virtual void set_data_fetch_future(std::future future, execution_mode mode) = 0; virtual std::future get_data_fetch_future(execution_mode mode) = 0; - virtual void calculate_num_iterations_per_epoch_spanning_models(int max_mini_batch_size, generic_data_reader *data_reader) = 0; - virtual void calculate_num_iterations_per_epoch_single_model(int max_mini_batch_size, generic_data_reader *data_reader) = 0; - - virtual int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int 
requested_num_parallel_readers) const = 0; - // protected: public: lbann_comm *m_comm; - const fetch_data_functor *fetch_data_fn; + const fetch_data_functor *fetch_data_fn; const update_data_reader_functor *update_data_reader_fn; }; -} + +#ifndef LBANN_GENERIC_IO_BUFFER_INSTANTIATE + +#define PROTO(T) \ + extern template class generic_io_buffer + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF + +#endif // LBANN_GENERIC_IO_BUFFER_INSTANTIATE + +} // namespace lbann #endif // LBANN_GENERIC_IO_BUFFER_HPP_INCLUDED diff --git a/include/lbann/io/data_buffers/partitioned_io_buffer.hpp b/include/lbann/io/data_buffers/partitioned_io_buffer.hpp index 13a4a23f8b2..56a438fa1c0 100644 --- a/include/lbann/io/data_buffers/partitioned_io_buffer.hpp +++ b/include/lbann/io/data_buffers/partitioned_io_buffer.hpp @@ -31,12 +31,22 @@ namespace lbann { +template class data_buffer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: /** Number of samples in the current mini-batch */ int m_num_samples_fetched; /** Distributed matrix used to stage local data to layer output */ - std::vector> m_input_buffers; + std::vector> m_input_buffers; std::atomic m_fetch_data_in_background; std::future m_data_fetch_future; /// 1-D Matrix of which indices were fetched in this mini-batch @@ -48,7 +58,7 @@ class data_buffer { m_input_buffers.clear(); m_input_buffers.resize(num_child_layers); for(int i = 0; i < num_child_layers; i++) { - m_input_buffers[i].reset(new StarVCMat(comm->get_trainer_grid())); + m_input_buffers[i].reset(new StarVCMatDT(comm->get_trainer_grid())); } } @@ -78,11 +88,24 @@ class data_buffer { /** * Parallel I/O routines for managing partitioned minibatches */ -class partitioned_io_buffer : public generic_io_buffer { +template +class partitioned_io_buffer : public generic_io_buffer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The local tensor type expected for IO in this object. 
*/ + using IODataType = DataType; + + ///@} + public: - typedef std::map data_buffer_map_t; + typedef std::map *> data_buffer_map_t; public: - partitioned_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers, int num_child_layers); + partitioned_io_buffer(lbann_comm *comm, int num_parallel_readers, int num_child_layers); partitioned_io_buffer(const partitioned_io_buffer& other); partitioned_io_buffer& operator=(const partitioned_io_buffer& other); ~partitioned_io_buffer(); @@ -94,8 +117,8 @@ class partitioned_io_buffer : public generic_io_buffer { void setup_data(El::Int num_neurons, El::Int num_targets, El::Int max_mini_batch_size) override; int fetch_to_local_matrix(generic_data_reader *data_reader, execution_mode mode) override; - void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMat& sample, AbsDistMat& response) override; - void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMat& sample) override; + void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample, AbsDistMatrixType& response) override; + void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample) override; bool update_data_set(generic_data_reader *data_reader, execution_mode mode) override; void set_fetch_data_in_background(bool flag, execution_mode mode) override; bool is_data_fetched_in_background(execution_mode mode) override; @@ -104,14 +127,9 @@ class partitioned_io_buffer : public generic_io_buffer { void set_data_fetch_future(std::future future, execution_mode mode) override; std::future get_data_fetch_future(execution_mode mode) override; - void calculate_num_iterations_per_epoch_spanning_models(int max_mini_batch_size, generic_data_reader *data_reader) override; - void calculate_num_iterations_per_epoch_single_model(int max_mini_batch_size, generic_data_reader *data_reader) override; - int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers) const override; - static int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers, const lbann_comm* comm); - - data_buffer *get_data_buffer(const execution_mode mode) const { - data_buffer *data_buffer = nullptr; - data_buffer_map_t::const_iterator it = m_data_buffers.find(mode); + data_buffer *get_data_buffer(const execution_mode mode) const { + data_buffer *data_buffer = nullptr; + typename data_buffer_map_t::const_iterator it = m_data_buffers.find(mode); if (it != m_data_buffers.end()) data_buffer = it->second; switch(mode) { diff --git a/include/lbann/io/persist.hpp b/include/lbann/io/persist.hpp index 409dc5ddf89..e616019b806 100644 --- a/include/lbann/io/persist.hpp +++ b/include/lbann/io/persist.hpp @@ -30,41 +30,100 @@ #define LBANN_PERSIST_H #include "lbann/base.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/enum_iterator.hpp" #include "El.hpp" +#include +#include +#include +#include +#include namespace lbann { enum class persist_type { train, // data should be saved in file with train data model, // data should be saved in file with model data - validate + metrics, + validate, + testing, + prediction_context, + training_context, + testing_context, + validation_context, }; +using persist_type_iterator = enum_iterator; + +inline persist_type execution_mode_to_persist_type(execution_mode m) { + switch(m) { + case 
execution_mode::training: + return persist_type::training_context; + case execution_mode::validation: + return persist_type::validation_context; + case execution_mode::testing: + return persist_type::testing_context; + case execution_mode::prediction: + return persist_type::prediction_context; + // case execution_mode::tournament: + // return persist_type::tournament; + case execution_mode::invalid: + default: + LBANN_ERROR("Invalid execution mode specified"); + } +} + +inline std::string to_string(persist_type pt) { + switch(pt) { + case persist_type::model: + return "model"; + case persist_type::metrics: + return "metrics"; + case persist_type::train: + return "train"; + case persist_type::validate: + return "validate"; + case persist_type::testing: + return "test"; + case persist_type::prediction_context: + return "prediction"; + case persist_type::training_context: + return "training"; + case persist_type::validation_context: + return "validation"; + case persist_type::testing_context: + return "testing"; + default: + LBANN_ERROR("Invalid persist type specified"); + } +} + +/// @todo Fix the callback types to properly track execution phases enum class callback_type { - batch, - epoch, - validation, - inference, + model_only, + weights_only, + execution_context_only, + full_checkpoint, invalid }; class persist { - protected: - uint64_t m_bytes; - int m_model_fd; - int m_train_fd; - int m_validate_fd; - char m_model_filename[1024]; - char m_train_filename[1024]; - char m_validate_filename[1024]; + private: + std::map m_bytes; + std::map m_filenames; callback_type ckpt_type; public: - char m_checkpoint_dir[1024]; + std::string m_checkpoint_dir; public: persist(); ~persist() {}; + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(CEREAL_NVP(ckpt_type)); + } + callback_type get_cb_type() const { return ckpt_type; } @@ -73,77 +132,190 @@ class persist { ckpt_type = type; } - void open_checkpoint(const char *dir); + void open_checkpoint_dir(const std::string& dir, bool create_dir); + void open_checkpoint(const std::string& dir, bool create_dir); void close_checkpoint(); - void open_restart(const char *dir); + void open_restart(const std::string& dir); void close_restart(); + void set_restart_dir(const std::string& dir) { m_checkpoint_dir = dir; } uint64_t get_bytes() const { - return m_bytes; + uint64_t bytes = 0; + for(auto& pt : m_bytes) { + bytes += pt.second; + } + return bytes; } void reset_bytes() { - m_bytes = 0; + for(auto& pt : m_bytes) { + pt.second = 0; + } } - bool write_rank_distmat(persist_type type, const char *name, const AbsDistMat& M); - bool read_rank_distmat(persist_type type, const char *name, AbsDistMat& M); - - bool write_distmat(persist_type type, const char *name, AbsDistMat *M); - bool read_distmat (persist_type type, const char *name, AbsDistMat *M); - - bool write_bytes(persist_type type, const char *name, const void *buf, size_t size); - bool read_bytes(persist_type type, const char *name, void *buf, size_t size); - - bool write_uint32(persist_type type, const char *name, uint32_t val); - bool read_uint32 (persist_type type, const char *name, uint32_t *val); - - bool write_uint64(persist_type type, const char *name, uint64_t val); - bool read_uint64 (persist_type type, const char *name, uint64_t *val); - - bool write_int32_contig(persist_type type, const char *name, const int32_t *buf, uint64_t count); - bool read_int32_contig (persist_type type, const char *name, int32_t *buf, uint64_t count); - - bool 
write_float(persist_type type, const char *name, float val); - bool read_float (persist_type type, const char *name, float *val); - - bool write_string(persist_type type, const char *name, const char *val, int str_length); - bool read_string (persist_type type, const char *name, char *val, int str_length); + template + bool write_rank_distmat(persist_type type, const char *name, const El::AbstractDistMatrix& M); + template + bool read_rank_distmat(persist_type type, const char *name, El::AbstractDistMatrix& M); - bool write_double(persist_type type, const char *name, double val); - bool read_double (persist_type type, const char *name, double *val); + template + bool write_distmat(persist_type type, const char *name, El::AbstractDistMatrix *M); + template + bool read_distmat (persist_type type, const char *name, El::AbstractDistMatrix *M); - bool write_datatype(persist_type type, const char *name, DataType val); - bool read_datatype (persist_type type, const char *name, DataType *val); + const std::string& get_checkpoint_dir() const { return m_checkpoint_dir; } - private: - int get_fd(persist_type type) const; + std::string get_filename(persist_type type) const; }; -bool write_distmat(int fd, const char *name, DistMat *M, uint64_t *bytes); -bool read_distmat (int fd, const char *name, DistMat *M, uint64_t *bytes); - bool write_bytes(int fd, const char *name, const void *buf, size_t size); bool read_bytes(int fd, const char *name, void *buf, size_t size); -bool write_uint32(int fd, const char *name, uint32_t val); -bool read_uint32 (int fd, const char *name, uint32_t *val); - -bool write_uint64(int fd, const char *name, uint64_t val); -bool read_uint64 (int fd, const char *name, uint64_t *val); +bool write_string(int fd, const char *name, const char *buf, size_t size); +bool read_string(int fd, const char *name, char *buf, size_t size); -bool write_int32_contig(int fd, const char *name, const int32_t *buf, uint64_t count); -bool read_int32_contig (int fd, const char *name, int32_t *buf, uint64_t count); +class NonexistentArchiveFile : public std::runtime_error { +public: + NonexistentArchiveFile(std::string const& filename) : std::runtime_error(std::string("Archive file not found: ") + filename) {} +}; -bool write_float(int fd, const char *name, float val); -bool read_float (int fd, const char *name, float *val); +template +void write_cereal_archive(C& obj, const std::string& filename) { + std::ofstream os(filename); + if(!os.is_open()) { + throw NonexistentArchiveFile(filename); + } + cereal::XMLOutputArchive archive(os); + archive(obj); +} + +template +void write_cereal_archive(C& obj, persist& p, const std::string& filename) { + write_cereal_archive(obj, p.get_checkpoint_dir() + "/" + filename); +} + +template +void write_cereal_archive(C& obj, persist& p, persist_type pt, const std::string& suffix) { + write_cereal_archive(obj, p.get_filename(pt) + suffix); +} + +template +void write_cereal_archive(C& obj, persist& p, execution_mode mode, const std::string& suffix) { + const persist_type pt = execution_mode_to_persist_type(mode); + write_cereal_archive(obj, p, pt, suffix); +} + +template +void read_cereal_archive(C& obj, const std::string& filename) { + std::ifstream is(filename); + if(!is.is_open()) { + throw NonexistentArchiveFile(filename); + } + cereal::XMLInputArchive archive(is); + archive(obj); +} + +template +void read_cereal_archive(C& obj, persist& p, const std::string& filename) { + read_cereal_archive(obj, p.get_checkpoint_dir() + "/" + filename); +} + +template +void 
read_cereal_archive(C& obj, persist& p, persist_type pt, const std::string& suffix) { + read_cereal_archive(obj, p.get_filename(pt) + suffix); +} + +template +void read_cereal_archive(C& obj, persist& p, execution_mode mode, const std::string& suffix) { + const persist_type pt = execution_mode_to_persist_type(mode); + read_cereal_archive(obj, p, pt, suffix); +} + +template +std::string create_cereal_archive_binary_string(C& obj) { + std::ostringstream ss; + { + cereal::BinaryOutputArchive archive(ss); + archive(obj); + } // archive goes out of scope, ensuring all contents are flushed + return ss.str(); +} + +template +void unpack_cereal_archive_binary_string(C& obj, const std::string& buf) { + std::istringstream ss(buf); + { + cereal::BinaryInputArchive archive(ss); + archive(obj); + } // archive goes out of scope, ensuring all contents are flushed +} + +template +void load_from_shared_cereal_archive(C& obj, + lbann_comm& comm, + const std::string& filename) { + std::string buf; + if (comm.am_trainer_master()) { + read_cereal_archive(obj, filename); + buf = create_cereal_archive_binary_string(obj); + }else { + // If you are not the trainer master, still check to see if the file exists + std::ifstream is(filename); + if(!is.is_open()) { + throw NonexistentArchiveFile(filename); + } + } -bool write_double(int fd, const char *name, double val); -bool read_double (int fd, const char *name, double *val); + // TODO: this assumes homogeneous processors + // broadcast state from rank 0 + comm.trainer_broadcast(0, buf); -bool write_string(int fd, const char *name, const char *buf, size_t size); -bool read_string(int fd, const char *name, char *buf, size_t size); + if (!comm.am_trainer_master()) { + unpack_cereal_archive_binary_string(obj, buf); + } +} + +template +void load_from_shared_cereal_archive(C& obj, persist& p, + lbann_comm& comm, + const std::string& filename) { + load_from_shared_cereal_archive(obj, comm, p.get_checkpoint_dir() + filename); +} + +template +void load_from_shared_cereal_archive(C& obj, persist& p, persist_type pt, + lbann_comm& comm, + const std::string& suffix) { + load_from_shared_cereal_archive(obj, comm, p.get_filename(pt) + suffix); +} + +template +void load_from_shared_cereal_archive(C& obj, persist& p, execution_mode mode, + lbann_comm& comm, + const std::string& suffix) { + const persist_type pt = execution_mode_to_persist_type(mode); + load_from_shared_cereal_archive(obj, p, pt, comm, suffix); +} + +#ifndef LBANN_PERSIST_INSTANTIATE +#define PROTO(T) \ + extern template bool persist::write_rank_distmat( \ + persist_type type, const char *name, const El::AbstractDistMatrix& M); \ + extern template bool persist::read_rank_distmat( \ + persist_type type, const char *name, El::AbstractDistMatrix& M); \ + extern template bool persist::write_distmat( \ + persist_type type, const char *name, El::AbstractDistMatrix *M); \ + extern template bool persist::read_distmat( \ + persist_type type, const char *name, El::AbstractDistMatrix *M) + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF +#endif // LBANN_PERSIST_INSTANTIATE } // namespace lbann diff --git a/include/lbann/layers/CMakeLists.txt b/include/lbann/layers/CMakeLists.txt index 0cc71271bcb..ab56ae6f153 100644 --- a/include/lbann/layers/CMakeLists.txt +++ b/include/lbann/layers/CMakeLists.txt @@ -1,8 +1,16 @@ # Add the headers for this directory 
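The cereal helpers above reduce checkpointing a C++ object to giving it a serialize() method. A minimal usage sketch follows; the training_state struct and the file name are hypothetical, while write_cereal_archive, read_cereal_archive, and load_from_shared_cereal_archive are the helpers defined above.

#include "lbann/io/persist.hpp"

struct training_state {
  size_t epoch = 0;
  double learning_rate = 0.01;
  // Any object handed to the helpers only needs a cereal-compatible serialize().
  template <class Archive>
  void serialize(Archive& ar) {
    ar(CEREAL_NVP(epoch), CEREAL_NVP(learning_rate));
  }
};

void checkpoint(lbann::lbann_comm& comm, training_state& state) {
  // Only the trainer master writes the XML archive to disk.
  if (comm.am_trainer_master()) {
    lbann::write_cereal_archive(state, "training_state.xml");
  }
}

void restart(lbann::lbann_comm& comm, training_state& state) {
  // Rank 0 reads the archive, packs it into a binary string, and broadcasts
  // it to the rest of the trainer, as load_from_shared_cereal_archive does above.
  lbann::load_from_shared_cereal_archive(state, comm, "training_state.xml");
}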
set_full_path(THIS_DIR_HEADERS layer.hpp + data_type_layer.hpp ) +if (LBANN_HAS_DISTCONV) + list(APPEND THIS_DIR_HEADERS + "${CMAKE_CURRENT_SOURCE_DIR}/distconv_adapter.hpp") + list(APPEND THIS_DIR_HEADERS + "${CMAKE_CURRENT_SOURCE_DIR}/data_type_distconv_adapter.hpp") +endif () + # Add the subdirectories add_subdirectory(activations) add_subdirectory(image) diff --git a/include/lbann/layers/activations/CMakeLists.txt b/include/lbann/layers/activations/CMakeLists.txt index bbcb0179add..553c4b3cebf 100644 --- a/include/lbann/layers/activations/CMakeLists.txt +++ b/include/lbann/layers/activations/CMakeLists.txt @@ -3,6 +3,7 @@ set_full_path(THIS_DIR_HEADERS activations.hpp elu.hpp identity.hpp + relu.hpp leaky_relu.hpp log_softmax.hpp softmax.hpp diff --git a/include/lbann/layers/activations/activations.hpp b/include/lbann/layers/activations/activations.hpp index b36c8d61072..24d11fade27 100644 --- a/include/lbann/layers/activations/activations.hpp +++ b/include/lbann/layers/activations/activations.hpp @@ -31,14 +31,30 @@ namespace lbann { +// Convenience macros for ETI decls for unary layers + +#ifndef LBANN_ACTIVATIONS_LAYER_INSTANTIATE +#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME +#else +#define UNARY_ETI_DECL_MACRO_DEV(...) +#endif // LBANN_UNARY_LAYER_INSTANTIATE + +#ifdef LBANN_HAS_GPU +#define UNARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU); \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::GPU) +#else +#define UNARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU) +#endif // LBANN_HAS_GPU + // Convenience macro to define an entry-wise unary layer class -#define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ - struct layer_name##_name_struct { \ - inline operator std::string() { return layer_string; } \ - }; \ - template \ - using layer_name \ - = entrywise_unary_layer; +#define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ + LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string); \ + UNARY_ETI_DECL_MACRO(layer_name, float); \ + UNARY_ETI_DECL_MACRO(layer_name, double) /** @class lbann::log_sigmoid_layer * @brief Logarithm of sigmoid function. @@ -46,15 +62,7 @@ namespace lbann { * @f[ \log(\sigma(x)) = -\log(1 + e^{-x}) @f] * See https://en.wikipedia.org/wiki/Sigmoid_function. */ -DEFINE_ENTRYWISE_UNARY_LAYER(log_sigmoid_layer, "log sigmoid") - -/** @class lbann::relu_layer - * @brief Rectified linear unit. - * - * @f[ \text{ReLU}(x) = \text{max}(x, 0) @f] - * See https://en.wikipedia.org/wiki/Rectifier_(neural_networks). - */ -DEFINE_ENTRYWISE_UNARY_LAYER(relu_layer, "ReLU") +DEFINE_ENTRYWISE_UNARY_LAYER(log_sigmoid_layer, "log sigmoid"); /** @class lbann::selu_layer * @brief Scaled exponential rectified linear unit. @@ -73,7 +81,7 @@ DEFINE_ENTRYWISE_UNARY_LAYER(relu_layer, "ReLU") * Hochreiter. "Self-normalizing neural networks." In Advances in * Neural Information Processing Systems, pp. 971-980. 2017. */ -DEFINE_ENTRYWISE_UNARY_LAYER(selu_layer, "SELU") +DEFINE_ENTRYWISE_UNARY_LAYER(selu_layer, "SELU"); /** @class lbann::sigmoid_layer * @brief Special case of logistic function. @@ -81,7 +89,7 @@ DEFINE_ENTRYWISE_UNARY_LAYER(selu_layer, "SELU") * @f[ \sigma(x) = \frac{1}{1 + e^{-x}} @f] * See https://en.wikipedia.org/wiki/Sigmoid_function. 
*/ -DEFINE_ENTRYWISE_UNARY_LAYER(sigmoid_layer, "sigmoid") +DEFINE_ENTRYWISE_UNARY_LAYER(sigmoid_layer, "sigmoid"); // Sigmoid function output is strictly in (0,1) // Note: Output is in the range [eps,1-eps], where 'eps' is machine // epsilon. This avoids denormalized floats and helps mitigate some @@ -94,16 +102,19 @@ DEFINE_ENTRYWISE_UNARY_LAYER(sigmoid_layer, "sigmoid") * @f[ \text{softplus}(x) = \log (e^x + 1) @f] * See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) */ -DEFINE_ENTRYWISE_UNARY_LAYER(softplus_layer, "softplus") +DEFINE_ENTRYWISE_UNARY_LAYER(softplus_layer, "softplus"); /** @class lbann::softsign_layer * @brief Smooth approximation to sign function. * * @f[ \text{softsign}(x) = \frac{x}{1 + |x|} @f] */ -DEFINE_ENTRYWISE_UNARY_LAYER(softsign_layer, "softsign") +DEFINE_ENTRYWISE_UNARY_LAYER(softsign_layer, "softsign"); } // namespace lbann #undef DEFINE_ENTRYWISE_UNARY_LAYER +#undef UNARY_ETI_DECL_MACRO +#undef UNARY_ETI_DECL_MACRO_DEV + #endif // LBANN_LAYERS_ACTIVATIONS_ACTIVATIONS_HPP_INCLUDED diff --git a/include/lbann/layers/activations/elu.hpp b/include/lbann/layers/activations/elu.hpp index 52f797488be..c64846f3224 100644 --- a/include/lbann/layers/activations/elu.hpp +++ b/include/lbann/layers/activations/elu.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_ELU_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_ELU_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -46,36 +46,45 @@ namespace lbann { * and accurate deep network learning by exponential linear units * (ELUs)." arXiv preprint arXiv:1511.07289 (2015). */ -template -class elu_layer : public Layer { +template +class elu_layer : public data_type_layer { public: - elu_layer(lbann_comm *comm, DataType alpha = 1) - : Layer(comm), m_alpha(alpha) {} + elu_layer(lbann_comm *comm, TensorDataType alpha = 1) + : data_type_layer(comm), m_alpha(alpha) {} elu_layer* copy() const override { return new elu_layer(*this); } std::string get_type() const override { return "ELU"; } data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("alpha", m_alpha); return desc; } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; void bp_compute() override; private: /** Scale parameter for negative region. 
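* For reference, ELU(x) = alpha * (exp(x) - 1) for x < 0 and x for x >= 0, * so with the default alpha = 1 negative inputs saturate at -1.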
*/ - DataType m_alpha; + TensorDataType m_alpha; }; +#ifndef LBANN_ELU_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class elu_layer; \ + extern template class elu_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_ELU_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_ELU_HPP_INCLUDED diff --git a/include/lbann/layers/activations/identity.hpp b/include/lbann/layers/activations/identity.hpp index e895ba44b99..ff59d2138dd 100644 --- a/include/lbann/layers/activations/identity.hpp +++ b/include/lbann/layers/activations/identity.hpp @@ -27,38 +27,73 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_IDENTITY_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_IDENTITY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class identity_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + identity_distconv_adapter(Layer &layer): + data_type_distconv_adapter(layer) {} + virtual ~identity_distconv_adapter() = default; + void setup_distributions(tensor_overlap_constraints &constraints) override; + std::unique_ptr setup_activations_i(int index) const override; + std::unique_ptr setup_error_signals_i(int index) const override; +}; +#endif // LBANN_HAS_DISTCONV + + /** @brief Output a tensor view. * * Forward and backward prop simply involve setting up tensor views, * and hence are very cheap. */ -template -class identity_layer : public Layer { +template +class identity_layer : public data_type_layer { public: - identity_layer(lbann_comm *comm) : Layer(comm) {} + identity_layer(lbann_comm *comm) : data_type_layer(comm) {} identity_layer* copy() const override { return new identity_layer(*this); } std::string get_type() const override { return "identity"; } data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_setup_outputs(El::Int mini_batch_size) override { - El::LockedView(get_activations(), get_prev_activations()); + El::LockedView(this->get_activations(), this->get_prev_activations()); } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - El::LockedView(get_error_signals(), get_prev_error_signals()); + El::LockedView(this->get_error_signals(), this->get_prev_error_signals()); } void fp_compute() override {} void bp_compute() override {} +#ifdef LBANN_HAS_DISTCONV + protected: + bool is_distconv_supported() const override { + return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } +#endif // LBANN_HAS_DISTCONV }; +#ifndef LBANN_IDENTITY_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class identity_layer; \ + extern template class identity_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_IDENTITY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_IDENTITY_HPP_INCLUDED diff --git a/include/lbann/layers/activations/leaky_relu.hpp 
b/include/lbann/layers/activations/leaky_relu.hpp index 0e576117d3c..b936a5ac1b9 100644 --- a/include/lbann/layers/activations/leaky_relu.hpp +++ b/include/lbann/layers/activations/leaky_relu.hpp @@ -27,10 +27,27 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_LEAKY_RELU_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_LEAKY_RELU_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class leaky_relu_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + + leaky_relu_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~leaky_relu_distconv_adapter() = default; + + void setup_distributions(tensor_overlap_constraints &constraints) override; + void setup_layer(size_t workspace_capacity) override; + + std::unique_ptr m_leaky_relu; +}; +#endif // LBANN_HAS_DISTCONV + /** @brief * * @f[ @@ -46,36 +63,98 @@ namespace lbann { * nonlinearities improve neural network acoustic models." In * Proc. ICML, vol. 30, no. 1, p. 3. 2013. */ -template -class leaky_relu_layer : public Layer { +template +class leaky_relu_layer : public data_type_layer { public: - leaky_relu_layer(lbann_comm *comm, DataType negative_slope = 0.01) - : Layer(comm), m_negative_slope(negative_slope) {} + leaky_relu_layer(lbann_comm *comm, TensorDataType negative_slope = 0.01) + : data_type_layer(comm), m_negative_slope(negative_slope) {} leaky_relu_layer* copy() const override { return new leaky_relu_layer(*this); } std::string get_type() const override { return "leaky ReLU"; } data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("Negative slope", m_negative_slope); return desc; } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; void bp_compute() override; private: /** Function slope in negative region. 
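* With the default negative_slope of 0.01, for example, the layer maps * -5 to -0.05 while leaving 5 unchanged.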
*/ - DataType m_negative_slope; + TensorDataType m_negative_slope; +#ifdef LBANN_HAS_DISTCONV + protected: + bool is_distconv_supported() const override { + return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } + leaky_relu_distconv_adapter& get_distconv_adapter() override; + const leaky_relu_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +leaky_relu_distconv_adapter& +leaky_relu_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const leaky_relu_distconv_adapter& +leaky_relu_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +void leaky_relu_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + + auto &x = this->get_prev_activations_dist(); + auto &y = this->get_activations_dist(); + auto &dx = this->get_error_signals_dist(); + auto &dy = this->get_prev_error_signals_dist(); + + // x == y + constraints.mark_equivalent(x, y); + // x == dx + constraints.mark_equivalent(x, dx); + // dx == dy + constraints.mark_equivalent(dx, dy); +} + +template +void leaky_relu_distconv_adapter::setup_layer( + size_t workspace_capacity) { + m_leaky_relu = make_unique(dc::get_backend()); +} +#endif // LBANN_HAS_DISTCONV + +#ifndef LBANN_LEAKY_RELU_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class leaky_relu_layer; \ + extern template class leaky_relu_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_LEAKY_RELU_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_LEAKY_RELU_HPP_INCLUDED diff --git a/include/lbann/layers/activations/log_softmax.hpp b/include/lbann/layers/activations/log_softmax.hpp index 136edf89600..669370f816a 100644 --- a/include/lbann/layers/activations/log_softmax.hpp +++ b/include/lbann/layers/activations/log_softmax.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_LOG_SOFTMAX_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_LOG_SOFTMAX_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/utils/cudnn.hpp" namespace lbann { @@ -36,19 +36,28 @@ namespace lbann { * * @f[ \log \text{softmax}(x)_i = x_i - \log \sum_j e^{x_j} @f] */ -template -class log_softmax_layer : public Layer { +template +class log_softmax_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: log_softmax_layer(lbann_comm *comm) - : Layer(comm) + : data_type_layer(comm) #ifdef LBANN_HAS_CUDNN , m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN {} log_softmax_layer(const log_softmax_layer& other) - : Layer(other), + : data_type_layer(other), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) #ifdef LBANN_HAS_CUDNN @@ -61,7 +70,7 @@ class log_softmax_layer : public Layer { } log_softmax_layer& operator=(const log_softmax_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? 
other.m_workspace->Copy() : nullptr); #ifdef LBANN_HAS_CUDNN @@ -78,16 +87,16 @@ class log_softmax_layer : public Layer { data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; - m_workspace.reset(AbsDistMat::Instantiate(dist)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist)); #ifdef HYDROGEN_HAVE_CUB if (m_workspace->GetLocalDevice() == El::Device::GPU) { m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool @@ -96,8 +105,8 @@ class log_softmax_layer : public Layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - Layer::fp_setup_outputs(mini_batch_size); - const auto& dist_data = get_prev_activations().DistData(); + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& dist_data = this->get_prev_activations().DistData(); m_workspace->Empty(false); m_workspace->AlignWith(dist_data); m_workspace->Resize(1, mini_batch_size); @@ -106,18 +115,32 @@ class log_softmax_layer : public Layer { void fp_compute() override; void bp_compute() override; + template + friend void fp_compute_impl(log_softmax_layer& l); + template + friend void bp_compute_impl(log_softmax_layer& l); + private: /** Workspace for column-wise reductions. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; #ifdef LBANN_HAS_CUDNN /** Tensor cuDNN descriptors. */ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN }; +#ifndef LBANN_LOG_SOFTMAX_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class log_softmax_layer; \ + extern template class log_softmax_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_LOG_SOFTMAX_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_LOG_SOFTMAX_HPP_INCLUDED diff --git a/include/lbann/layers/activations/relu.hpp b/include/lbann/layers/activations/relu.hpp new file mode 100644 index 00000000000..f95c663ac86 --- /dev/null +++ b/include/lbann/layers/activations/relu.hpp @@ -0,0 +1,131 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_ACTIVATION_RELU_HPP_INCLUDED +#define LBANN_LAYER_ACTIVATION_RELU_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/utils/distconv.hpp" + +namespace lbann { + +#ifdef LBANN_HAS_DISTCONV +template +class relu_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + relu_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~relu_distconv_adapter() = default; + void setup_distributions(tensor_overlap_constraints &constraints) override; + void setup_layer(size_t workspace_capacity) override; + std::unique_ptr m_relu; +}; +#endif // LBANN_HAS_DISTCONV + +/** Rectified linear unit activation function layer. + * \f[ ReLU(x) = \text{max}(x, 0) \f] + * See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) + */ +template +class relu_layer : public data_type_layer { +public: + relu_layer(lbann_comm *comm) : data_type_layer(comm) {} + relu_layer* copy() const override { return new relu_layer(*this); } + std::string get_type() const override { return "ReLU"; } + data_layout get_data_layout() const override { return T_layout; } + El::Device get_device_allocation() const override { return Dev; } + +protected: + void fp_compute() override; + void bp_compute() override; +#ifdef LBANN_HAS_DISTCONV + bool is_distconv_supported() const override { + return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } + relu_distconv_adapter& get_distconv_adapter() override; + const relu_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV +}; + +#ifdef LBANN_HAS_DISTCONV +template +relu_distconv_adapter& +relu_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const relu_distconv_adapter& +relu_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +void relu_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + + auto &x = this->get_prev_activations_dist(); + auto &y = this->get_activations_dist(); + auto &dx = this->get_error_signals_dist(); + auto &dy = this->get_prev_error_signals_dist(); + + // x == dx + constraints.mark_equivalent(x, dx); + // y == dy + constraints.mark_equivalent(y, dy); +} + +template +void relu_distconv_adapter::setup_layer( + size_t workspace_capacity) { + m_relu = make_unique(dc::get_backend()); + m_relu->setup(this->get_prev_activations(), + this->get_activations(), + this->get_error_signals(), + this->get_prev_error_signals()); +} +#endif // LBANN_HAS_DISTCONV + +#ifndef LBANN_RELU_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class relu_layer; \ + extern template class relu_layer + +#include 
"lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_RELU_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYER_ACTIVATION_RELU_HPP_INCLUDED diff --git a/include/lbann/layers/activations/softmax.hpp b/include/lbann/layers/activations/softmax.hpp index 665323c3c14..0a3a4c9917a 100644 --- a/include/lbann/layers/activations/softmax.hpp +++ b/include/lbann/layers/activations/softmax.hpp @@ -27,35 +27,91 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_SOFTMAX_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_SOFTMAX_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/utils/cudnn.hpp" +#include "lbann/utils/distconv.hpp" // Threshold outputs to a minimum value. + // If enabled, the minimum output value is sqrt(min), where min is the // minimum, normalized, positive value (~1e-19 for float and ~1e-154 -// for double). The gradients w.r.t. input will be inaccurate, on the -// order of the minimum output value. -#define LBANN_ENABLE_SOFTMAX_CUTOFF +// for double). During backprop, gradients are computed as if +// thresholding did not occur, so there will be a discrepancy for +// values that are thresholded. +#define LBANN_ENABLE_SOFTMAX_THRESHOLD namespace lbann { -/** @brief - * +/** @brief Which tensor dimensions to apply softmax over. */ +enum class softmax_mode { + INVALID, + /** @brief Sample-wise softmax. + * + * Slice tensor along the sample dimension (assuming data in NCHW + * format) and apply softmax independently to each slice (once per + * sample). + */ + INSTANCE, + /** @brief Position-wise softmax. + * + * Split tensor along all but the channel dimension (assuming data + * in NCHW format) and apply softmax independently to each piece + * (once per spatial position per sample). + * + * This is not to be confused with @c channelwise_softmax, which + * slices along the sample and channel dimensions. + */ + CHANNEL +}; + +#ifdef LBANN_HAS_DISTCONV +template +class softmax_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + + softmax_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~softmax_distconv_adapter() = default; + + void setup_distributions(tensor_overlap_constraints &constraints) override; + void setup_layer(size_t workspace_capacity) override; + + std::unique_ptr m_softmax; +}; +#endif // LBANN_HAS_DISTCONV + +/** * @f[ \text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}} @f] */ -template -class softmax_layer : public Layer { +template +class softmax_layer : public data_type_layer { public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} - softmax_layer(lbann_comm *comm) - : Layer(comm) +public: + + softmax_layer(lbann_comm *comm, + softmax_mode mode) + : data_type_layer(comm), + m_mode(mode) #ifdef LBANN_HAS_CUDNN , m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN - {} + { + if(mode == softmax_mode::INVALID) { + LBANN_ERROR("invalid softmax mode"); + } + } softmax_layer(const softmax_layer& other) - : Layer(other), + : data_type_layer(other), + m_mode(other.m_mode), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) #ifdef LBANN_HAS_CUDNN @@ -67,17 +123,6 @@ class softmax_layer : public Layer { #endif // LBANN_HAS_CUDNN } - softmax_layer& operator=(const softmax_layer& other) { - Layer::operator=(other); - m_workspace.reset(other.m_workspace ? 
- other.m_workspace->Copy() : nullptr); -#ifdef LBANN_HAS_CUDNN - m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; - m_tensors_cudnn_desc.set_layer(this); -#endif // LBANN_HAS_CUDNN - return *this; - } - ~softmax_layer() = default; softmax_layer* copy() const override { return new softmax_layer(*this); } @@ -85,16 +130,16 @@ class softmax_layer : public Layer { data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; - m_workspace.reset(AbsDistMat::Instantiate(dist)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist)); #ifdef HYDROGEN_HAVE_CUB if (m_workspace->GetLocalDevice() == El::Device::GPU) { m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool @@ -103,8 +148,8 @@ class softmax_layer : public Layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - Layer::fp_setup_outputs(mini_batch_size); - const auto& dist_data = get_prev_activations().DistData(); + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& dist_data = this->get_prev_activations().DistData(); m_workspace->Empty(false); m_workspace->AlignWith(dist_data); m_workspace->Resize(1, mini_batch_size); @@ -113,18 +158,114 @@ class softmax_layer : public Layer { void fp_compute() override; void bp_compute() override; + template + friend void fp_compute_impl(softmax_layer& l); + template + friend void bp_compute_impl(softmax_layer& l); + private: + /** Softmax mode. */ + const softmax_mode m_mode; + /** Workspace for column-wise reductions. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; #ifdef LBANN_HAS_CUDNN /** Tensor cuDNN descriptors. 
*/ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN +// Minimum output value to avoid denormalized floats +#ifdef LBANN_ENABLE_SOFTMAX_THRESHOLD + const TensorDataType threshold_val = static_cast(El::Sqrt(std::numeric_limits::min())); +#else + const TensorDataType threshold_val = El::TypeTraits::Zero(); +#endif // LBANN_ENABLE_SOFTMAX_THRESHOLD + +#ifdef LBANN_HAS_DISTCONV + friend class softmax_distconv_adapter; + protected: + bool is_distconv_supported() const override { + return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } + softmax_distconv_adapter& get_distconv_adapter() override; + const softmax_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +softmax_distconv_adapter& +softmax_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const softmax_distconv_adapter& +softmax_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +void softmax_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + // No overlap supported yet + for (auto &d: this->m_prev_activations_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_activations_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_prev_error_signals_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_error_signals_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } +} + +template +void softmax_distconv_adapter::setup_layer( + size_t workspace_capacity) { + auto &l = dynamic_cast&>( + this->layer()); + m_softmax = make_unique(dc::get_backend()); + auto mode = l.m_mode == softmax_mode::INSTANCE ? + ::distconv::SoftmaxMode::INSTANCE : + ::distconv::SoftmaxMode::CHANNEL; + m_softmax->setup(this->get_prev_activations(), mode); +} +#endif // LBANN_HAS_DISTCONV + + +LBANN_DEFINE_LAYER_BUILDER(softmax); + +#ifndef LBANN_SOFTMAX_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class softmax_layer; \ + extern template class softmax_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_SOFTMAX_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_SOFTMAX_HPP_INCLUDED diff --git a/include/lbann/layers/data_type_distconv_adapter.hpp b/include/lbann/layers/data_type_distconv_adapter.hpp new file mode 100644 index 00000000000..a120965ad67 --- /dev/null +++ b/include/lbann/layers/data_type_distconv_adapter.hpp @@ -0,0 +1,163 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_DATA_TYPE_DISTCONV_ADAPTER_HPP_INCLUDED +#define LBANN_LAYERS_DATA_TYPE_DISTCONV_ADAPTER_HPP_INCLUDED + +#include "lbann/layers/distconv_adapter.hpp" +#include "lbann/layers/layer.hpp" + +namespace lbann { + +template +class data_type_distconv_adapter: public distconv_adapter { +public: + using TensorDevType = dc::TensorDev; + using TensorShufflerType = dc::TensorShuffler; + + data_type_distconv_adapter(Layer& layer): distconv_adapter(layer) {} + virtual ~data_type_distconv_adapter() = default; + + /** Get activation tensor corresponding to child layer. */ + const TensorDevType& get_activations(const Layer& child) const override; + /** Get error signal tensor corresponding to parent layer. */ + const TensorDevType& get_error_signals(const Layer& parent) const override; + + /** Get activation tensor. */ + const TensorDevType& get_activations(int child_index = 0) const; + /** Get activation tensor. */ + TensorDevType& get_activations(int child_index = 0); + /** Get original activation tensor. */ + const TensorDevType& get_original_activations(int child_index = 0) const; + /** Get original activation tensor. */ + TensorDevType& get_original_activations(int child_index = 0); + + /** Get previous activation tensor. */ + const TensorDevType& get_prev_activations(int parent_index = 0) const; + /** Get previous activation tensor. */ + TensorDevType& get_prev_activations(int parent_index = 0); + /** Get original previous activation tensor. */ + const TensorDevType& get_original_prev_activations(int parent_index = 0) const; + /** Get original previous activation tensor. */ + TensorDevType& get_original_prev_activations(int parent_index = 0); + + /** Get error signal tensor. */ + const TensorDevType& get_error_signals(int parent_index = 0) const; + /** Get error signal tensor. */ + TensorDevType& get_error_signals(int parent_index = 0); + /** Get original error signal tensor. */ + const TensorDevType& get_original_error_signals(int parent_index = 0) const; + /** Get original error signal tensor. */ + TensorDevType& get_original_error_signals(int parent_index = 0); + + /** Get previous error siganl tensor. */ + const TensorDevType& get_prev_error_signals(int child_index = 0) const; + /** Get previous error siganl tensor. */ + TensorDevType& get_prev_error_signals(int child_index = 0); + /** Get original previous error signal tensor. */ + const TensorDevType& get_original_prev_error_signals(int child_index = 0) const; + /** Get original previous error signal tensor. 
*/ + TensorDevType& get_original_prev_error_signals(int child_index = 0); + + void fp_setup(El::Int mini_batch_size) override; + void fp_postprocess() override; + void bp_setup(El::Int mini_batch_size) override; + void bp_postprocess() override; + + void dump_activations() const override; + void dump_original_activations() override; + void dump_error_signals() const override; + void dump_original_error_signals() override; + + protected: + // Setup fp tensors + void setup_prev_activations() override; + virtual std::unique_ptr setup_prev_activations_i(int index) const; + void setup_original_prev_activations() override; + virtual std::unique_ptr setup_original_prev_activations_i(int index) const; + void setup_activations() override; + virtual std::unique_ptr setup_activations_i(int index) const; + void setup_original_activations() override; + virtual std::unique_ptr setup_original_activations_i(int index) const; + + // Setup bp tensors + void setup_prev_error_signals() override; + virtual std::unique_ptr setup_prev_error_signals_i(int index) const; + void setup_original_prev_error_signals() override; + virtual std::unique_ptr setup_original_prev_error_signals_i(int index) const; + void setup_error_signals() override; + virtual std::unique_ptr setup_error_signals_i(int index) const; + void setup_original_error_signals() override; + virtual std::unique_ptr setup_original_error_signals_i(int index) const; + + virtual dc::Shape get_prev_activations_shape(int input_index=0) const; + virtual dc::Shape get_prev_activations_local_shape(int input_index=0) const; + virtual dc::Shape get_activations_shape(int index=0) const; + virtual dc::Shape get_activations_local_shape(int index=0) const; + + virtual dc::Shape get_prev_error_signals_shape(int index=0) const; + virtual dc::Shape get_prev_error_signals_local_shape(int index=0) const; + virtual dc::Shape get_error_signals_shape(int index=0) const; + virtual dc::Shape get_error_signals_local_shape(int index=0) const; + + void ensure_prev_activations() override; + void copy_out_activations() override; + void ensure_prev_error_signals() override; + void copy_out_error_signals() override; + + TensorShufflerType& get_prev_activations_shuffler( + const TensorDevType &src, const TensorDevType &dst); + TensorShufflerType& get_activations_shuffler( + const TensorDevType &src, const TensorDevType &dst); + TensorShufflerType& get_prev_error_signals_shuffler( + const TensorDevType &src, const TensorDevType &dst); + TensorShufflerType& get_error_signals_shuffler( + const TensorDevType &src, const TensorDevType &dst); + + private: + std::vector> m_inputs; + std::vector> m_original_inputs; + std::vector> m_outputs; + std::vector> m_original_outputs; + + std::vector> m_gradient_wrt_inputs; + std::vector> m_original_gradient_wrt_inputs; + std::vector> m_gradient_wrt_outputs; + std::vector> m_original_gradient_wrt_outputs; + + // TODO: Use unique_ptr + std::array m_prev_activations_shufflers{ {nullptr, nullptr, nullptr, nullptr} }; + std::array m_activations_shufflers{ {nullptr, nullptr, nullptr, nullptr} }; + std::array m_prev_error_signals_shufflers{ {nullptr, nullptr, nullptr, nullptr} }; + std::array m_error_signals_shufflers{ {nullptr, nullptr, nullptr, nullptr} }; + + void set_activations_outermost_dimension(size_t dim); + void set_error_signals_outermost_dimension(size_t dim); +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_DATA_TYPE_DISTCONV_ADAPTER_HPP_INCLUDED diff --git a/include/lbann/layers/data_type_layer.hpp b/include/lbann/layers/data_type_layer.hpp 
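The extern-template idiom used by the PROTO / PROTO_DEVICE macros throughout these headers keeps each layer's template definitions out of every translation unit and instantiates them once in a source file. A generic sketch of the same pattern, with made-up names (my_layer, MY_LAYER_INSTANTIATE), not the actual LBANN macros:

// my_layer.hpp
template <typename TensorDataType>
class my_layer {
public:
  TensorDataType scale(TensorDataType x) const { return x * TensorDataType(2); }
};

#ifndef MY_LAYER_INSTANTIATE
// Headers only declare the instantiations, so including this file is cheap.
extern template class my_layer<float>;
extern template class my_layer<double>;
#endif // MY_LAYER_INSTANTIATE

// my_layer.cpp
#define MY_LAYER_INSTANTIATE
#include "my_layer.hpp"
// The one place the template code is actually compiled.
template class my_layer<float>;
template class my_layer<double>;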
new file mode 100644 index 00000000000..2c363ccef21 --- /dev/null +++ b/include/lbann/layers/data_type_layer.hpp @@ -0,0 +1,394 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_DATA_TYPE_LAYER_HPP_INCLUDED +#define LBANN_LAYERS_DATA_TYPE_LAYER_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" +#include "lbann/weights/weights_proxy.hpp" + +#include "lbann/utils/h2_tmp.hpp" + +#ifdef LBANN_HAS_DISTCONV +#include "lbann/layers/data_type_distconv_adapter.hpp" +#include +#include +#include +#endif // LBANN_HAS_DISTCONV + +namespace lbann { + +// Forward declarations +namespace cudnn { +template +class data_parallel_layer_tensor_manager; +template +class entrywise_layer_tensor_manager; +} + +using supported_layer_data_type = h2::meta::TL< +#ifdef LBANN_HAS_GPU_FP16 + fp16, +#endif +#ifdef LBANN_HAS_HALF + cpu_fp16, +#endif + float, double>; + +template +class data_type_layer : public Layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The proxy tensor type expected in this object. */ + template + using AbsDistMatReadProxyType = El::AbstractDistMatrixReadDeviceProxy; + + /** @brief The local tensor type expected in this object. */ + using AbsMatrixType = El::AbstractMatrix; + + /** @brief The proxy type for weights used by this object. */ + using WeightsProxyType = weights_proxy; + + ///@} + +public: + static_assert( + h2::meta::tlist::MemberV(), + "Must use a supported type."); + + data_type_layer(lbann_comm *comm, bool persistent_error_signals=false) + : Layer(comm), m_persistent_error_signals{persistent_error_signals} {} + data_type_layer(const data_type_layer& other); + data_type_layer& operator=(const data_type_layer& other); + virtual ~data_type_layer() = default; + + /** Get a string representing the layer datatype + */ + std::string get_datatype_name() const override { + return TypeName(); + }; + + /** Forward propagation step. + * Apply a mathematical operation to input tensors to obtain output + * tensors. + */ + void forward_prop() final; + + void summarize_matrices(lbann_summary& summarizer, int step) override; + + /** Check that the setup is reasonable. 
*/ + void check_setup() override; + + // =========================================================== + // Public Tensor access functions + // =========================================================== + + /** Get activation tensor corresponding to child layer. */ + const BaseDistMat& get_activations(const Layer& child) const override; + /** Get error signal tensor corresponding to parent layer. */ + const BaseDistMat& get_error_signals(const Layer& parent) const override; + + /** Get activation tensor. */ + AbsDistMatrixType& get_activations(int child_index = 0); + /** Get error signal tensor. */ + AbsDistMatrixType& get_error_signals(int parent_index = 0); + /** Get activation tensor. */ + const AbsDistMatrixType& get_activations(int child_index = 0) const; + /** Get error signal tensor. */ + const AbsDistMatrixType& get_error_signals(int parent_index = 0) const; + + /** Get local portion of activation tensor. */ + AbsMatrixType& get_local_activations(int child_index = 0); + /** Get local portion of error signal tensor. */ + AbsMatrixType& get_local_error_signals(int parent_index = 0); + /** Get local portion of activation tensor. */ + const AbsMatrixType& get_local_activations(int child_index = 0) const; + /** Get local portion of error signal tensor. */ + const AbsMatrixType& get_local_error_signals(int parent_index = 0) const; + + /** @brief Set whether to keep or dynamically reallocate error signals. + * + * Passing a value of @c true means to keep the error signals; @c + * false means to dynamically reallocate them. + */ + void set_keep_error_signals(bool) override; + +protected: + + // =========================================================== + // Protected Tensor access functions + // =========================================================== + + /** Get previous activation tensor. */ + const AbsDistMatrixType& get_prev_activations(int parent_index = 0) const; + /** Get previous error signal tensor. */ + const AbsDistMatrixType& get_prev_error_signals(int child_index = 0) const; + + /** Get local portion of previous activation tensor. */ + const AbsMatrixType& get_local_prev_activations(int parent_index = 0) const; + /** Get local portion of previous error signal tensor. */ + const AbsMatrixType& get_local_prev_error_signals(int child_index = 0) const; + +protected: + + // =========================================================== + // Setup helper functions + // =========================================================== + + /** Setup distributed matrices. + * Called by the 'setup' function. Each column of these distributed + * matrices is interpreted as the flattened tensor for a mini-batch + * sample. The matrices themselves are constructed by calling the + * 'construct_matrix' function. If any matrices have already been + * setup, they are destroyed and reinstantiated. + */ + void setup_matrices(const El::Grid& grid) override; + + /** Setup layer data. + * Called by the 'setup' function. Memory is allocated for + * distributed matrices. + */ + void setup_data(size_t max_mini_batch_size) override; + + // =========================================================== + // Forward prop step helper functions + // =========================================================== + + /** Setup input tensors. + * Called by the 'forward_prop' function. Each input tensor is + * setup as a view or copy of the corresponding parent layer's + * output tensor. + */ + void fp_setup_inputs(El::Int mini_batch_size) override; + /** Setup output tensors. + * Called by the 'forward_prop' function. 
Each output tensor is + * resized to match the mini-batch size. + */ + void fp_setup_outputs(El::Int mini_batch_size) override; + + // =========================================================== + // Back prop step helper functions + // =========================================================== + + /** Setup gradient w.r.t. input tensors. + * Called by the 'back_prop' function. Each gradient w.r.t. input + * tensor is resized to match the mini-batch size. + */ + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override; + /** Compute objective funciton gradients. + * Called by the 'back_prop' function. Given the input, output, and + * gradient w.r.t. output tensors, the gradient w.r.t. input + * tensors are populated with the computed values and the gradients + * w.r.t. the weights are sent to the appropriate optimizers. + */ + void bp_compute() override; + + // =========================================================== + // Protected Weights access functions + // =========================================================== + + /** @brief Get the values matrix for a specific weights object */ + AbsDistMatrixType const& weights_values(size_t idx) const { + if (idx >= m_weights_proxy.size()) + LBANN_ERROR("Bad index ", idx, " " + "(size=" , m_weights_proxy.size(), ")"); + return m_weights_proxy[idx].values(); + } + + /** @brief Get a specific master weights object. + * + * This is sufficient for setting or accessing metadata about the + * weights class. + */ + weights& master_weights(size_t idx) { + return get_weights(idx); + } + weights const& master_weights(size_t idx) const { + return get_weights(idx); + } + +private: + + void setup_weights(size_t idx, weights& w) override; + + /** @brief Attempt to take ownership of the previous error signal. + * + * If the underlying matrix has the right datatype and + * distribution, the signal is moved explicitly. Otherwise a deep + * copy is made so that it has the correct datatype and + * distribution. + * + * This is valid if the child layer does not have persistent error + * signals. + * + * @param child The layer from which the error signal has come. + * @param signal The error signal from the layer. + */ + void move_or_copy_prev_error_signal_( + const Layer& child, + std::unique_ptr signal) final; + + /** @brief Attempt to view the previous error signal. + * + * If the underlying matrix has the right datatype and + * distribution, the signal can be viewed directly. Otherwise a + * deep copy is made so that it has the correct datatype and + * distribution. + * + * This is only valid if the child layer has persistent error + * signals. Otherwise, the viewed data my be invalidated. + * + * @param child The layer from which the error signal has come. + * @param signal The error signal from the layer. + */ + void view_or_copy_prev_error_signal_( + const Layer& child, + const El::BaseDistMatrix& signal) final; + + /** @brief Deep copy the error signal. + * + * In some cases, it can be determined that neither viewing nor + * moving is a possibility. In these cases, we must do a deep copy. + * + * @param child The layer from which the error signal has come. + * @param signal The error signal from the layer. + */ + void deep_copy_prev_error_signal_( + const Layer& child, + const El::BaseDistMatrix& signal) final; + + /** @brief Ensure that gradient matrices exist. + * + * This step is performed immediately prior to the bp_compute() + * work. 
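Aside on the error-signal ownership helpers declared above (move_or_copy_prev_error_signal_, view_or_copy_prev_error_signal_, deep_copy_prev_error_signal_): per the comments, the cheap path is taken only when the incoming matrix already has this layer's datatype and distribution, and otherwise a deep copy converts it. A generic, self-contained sketch of that decision follows; BaseT and DerivedT stand in for the patch's El::BaseDistMatrix and El::AbstractDistMatrix<TensorDataType>, and the deep-copy callable is left to the caller, so this is an illustration rather than the patch's implementation.

#include <memory>

// Move when the concrete type already matches; otherwise fall back to a
// caller-supplied deep copy that performs the datatype/distribution conversion.
template <typename DerivedT, typename BaseT, typename CopyFn>
std::unique_ptr<DerivedT>
take_or_copy_sketch(std::unique_ptr<BaseT> signal, CopyFn deep_copy) {
  if (auto* typed = dynamic_cast<DerivedT*>(signal.get())) {
    signal.release();                 // adopt the existing matrix, no data movement
    return std::unique_ptr<DerivedT>(typed);
  }
  return deep_copy(*signal);          // convert via a deep copy
}

In the patch itself there is the further wrinkle that even a type match may require a copy when the distribution differs or when the child layer keeps persistent error signals, as the surrounding comments describe.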
+ */ + void allocate_new_gradients_() final; + + /** @brief Send error signals computed by this layer to their + * respective parents. + * + * This step is performed immediately after the bp_compute() work + * and prior to clearing the previous error signals. This ordering + * is necessary in case this layer's error signals are views into + * the previous error signals. + */ + void propagate_error_signals_to_parents_() final; + + /** @brief Free previous error signals, if possible. + * + * This step is performed at the end of a layer's backprop phase. + */ + void clear_prev_error_signals_() final; + + /** Backward propagation step. + * Given the objective function gradients w.r.t. the output + * tensors, compute the gradients w.r.t. the input tensors and + * w.r.t. the weights. This is essentially an application of the + * chain rule. + */ + void back_prop_impl_() final; + + // =========================================================== + // Private class members + // =========================================================== + + /** @brief Persistent, read-only, proxied views of the weights + * values matrix. + * + * @note (trb 05/28/2020): These are kept as members out of + * consideration for the case where accessing them could require a + * deep copy. This is more out of my own concern about ways in + * which derived classes could abuse weights; in theory, I believe, + * you could just create these on the fly once during FP and once + * during BP. Then the question is: does the performance cost of + * (potentially) two(ish) copies or the memory cost of storing an + * additional copy of the weights hurt more? + */ + std::vector m_weights_proxy; + + /** Input tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_inputs; + /** Output tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_outputs; + /** Objective function gradients w.r.t. the output tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_gradient_wrt_outputs; + /** Objective function gradients w.r.t. the input tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_gradient_wrt_inputs; + + /** @brief Whether to keep persistent error signals or dynamically + * allocate/deallocate them. + * + * The default behavior is dynamic allocation. 
+ */ + bool m_persistent_error_signals = false; + +#ifdef LBANN_HAS_DISTCONV + friend class data_type_distconv_adapter; + public: + data_type_distconv_adapter& get_distconv_adapter() override; + const data_type_distconv_adapter& get_distconv_adapter() const override; + + protected: + void setup_distconv_adapter() override; +#endif // LBANN_HAS_DISTCONV + +#ifdef LBANN_HAS_CUDA + template + friend class cudnn::data_parallel_layer_tensor_manager; + template + friend class cudnn::entrywise_layer_tensor_manager; +#endif // LBANN_HAS_CUDA +}; + +#ifndef LBANN_DATA_TYPE_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class data_type_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF + +#endif // LBANN_DATA_TYPE_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_DATA_TYPE_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/distconv_adapter.hpp b/include/lbann/layers/distconv_adapter.hpp new file mode 100644 index 00000000000..0b6175ec1e8 --- /dev/null +++ b/include/lbann/layers/distconv_adapter.hpp @@ -0,0 +1,141 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_DISTCONV_ADAPTER_HPP_INCLUDED +#define LBANN_LAYERS_DISTCONV_ADAPTER_HPP_INCLUDED + +#include "lbann/utils/distconv.hpp" + +#include +#include + +namespace lbann { + +class Layer; + +class tensor_overlap_constraints { + public: + using dist_set = std::unordered_set; + using const_dist_set = std::unordered_set; + + tensor_overlap_constraints() = default; + virtual ~tensor_overlap_constraints() = default; + + void mark_equivalent(dc::Dist &d1, dc::Dist &d2); + void mark_updated(const dc::Dist &d); + void mark_invariant(const dc::Dist &d); + + void find_valid_overlap(); + + private: + std::unordered_map m_equivalents; + const_dist_set m_updated; + const_dist_set m_invariants; +}; + +class distconv_adapter { + friend class Layer; + public: + distconv_adapter(Layer& layer); + virtual ~distconv_adapter() = default; + + /** Get activation tensor corresponding to child layer. */ + virtual const dc::AbsTensor& get_activations(const Layer& child) const = 0; + /** Get error signal tensor corresponding to parent layer. 
*/ + virtual const dc::AbsTensor& get_error_signals(const Layer& parent) const = 0; + + virtual void setup_distributions(tensor_overlap_constraints &constraints); + void impose_adjacent_overlap_constraints( + tensor_overlap_constraints &constraints); + + dc::Dist &get_prev_activations_dist(); + const dc::Dist &get_prev_activations_dist() const; + dc::Dist &get_activations_dist(); + const dc::Dist &get_activations_dist() const; + dc::Dist &get_prev_error_signals_dist(); + const dc::Dist &get_prev_error_signals_dist() const; + dc::Dist &get_error_signals_dist(); + const dc::Dist &get_error_signals_dist() const; + + virtual void setup_fp_tensors(); + virtual void setup_bp_tensors(); + + virtual void setup_layer(size_t workspace_capacity) {} + + virtual void fp_setup(El::Int mini_batch_size) = 0; + virtual void fp_postprocess() = 0; + virtual void bp_setup(El::Int mini_batch_size) = 0; + virtual void bp_postprocess() = 0; + + virtual bool parent_copy_required(size_t input_index) const; + virtual bool parent_shuffle_required(size_t input_index) const; + virtual bool child_copy_required(size_t output_index) const; + virtual bool child_shuffle_required(size_t output_index) const; + + virtual void dump_activations() const = 0; + virtual void dump_original_activations()= 0; + virtual void dump_error_signals() const = 0; + virtual void dump_original_error_signals()= 0; + + protected: + virtual Layer& layer(); + virtual const Layer& layer() const; + std::string get_name() const; + + virtual void setup_prev_activations() = 0; + virtual void setup_original_prev_activations() = 0; + virtual void setup_activations() = 0; + virtual void setup_original_activations() = 0; + + virtual void setup_prev_error_signals() = 0; + virtual void setup_original_prev_error_signals() = 0; + virtual void setup_error_signals() = 0; + virtual void setup_original_error_signals() = 0; + + virtual void ensure_prev_activations() = 0; + virtual void copy_out_activations() = 0; + virtual void ensure_prev_error_signals() = 0; + virtual void copy_out_error_signals() = 0; + + std::vector m_prev_activations_dists; + std::vector m_activations_dists; + std::vector m_prev_error_signals_dists; + std::vector m_error_signals_dists; + + private: + Layer& m_layer; + std::vector m_parent_copy_required; + std::vector m_parent_shuffle_required; + std::vector m_child_copy_required; + std::vector m_child_shuffle_required; + + void setup_tensor_shuffle(); + void adjust_parallel_strategy(); +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_DISTCONV_ADAPTER_HPP_INCLUDED diff --git a/include/lbann/layers/image/bilinear_resize.hpp b/include/lbann/layers/image/bilinear_resize.hpp index 2e3e9e9da67..b77fba2e138 100644 --- a/include/lbann/layers/image/bilinear_resize.hpp +++ b/include/lbann/layers/image/bilinear_resize.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED #define LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -36,14 +36,14 @@ namespace lbann { * Tensors are assumed to be image data in CHW format. Gradients are * not propagated during backprop. 
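Stepping back to the distconv_adapter header that ends above: tensor_overlap_constraints exposes mark_equivalent, mark_updated, mark_invariant, and find_valid_overlap, and each adapter's setup_distributions() is expected to register its distributions through that interface. The call pattern below is inferred from the method names only and is purely illustrative (and assumes an LBANN build with Distconv enabled); the real implementations may differ.

#include "lbann/layers/distconv_adapter.hpp"

namespace lbann_sketch {

// Hypothetical registration for one parent/child connection.
void register_constraints(lbann::tensor_overlap_constraints& constraints,
                          lbann::distconv_adapter& self,
                          lbann::distconv_adapter& parent) {
  // The child's input overlap should agree with the parent's output overlap...
  constraints.mark_equivalent(self.get_prev_activations_dist(),
                              parent.get_activations_dist());
  // ...the parent's side is treated as fixed...
  constraints.mark_invariant(parent.get_activations_dist());
  // ...while the child's side may still be adjusted.
  constraints.mark_updated(self.get_prev_activations_dist());
}

} // namespace lbann_sketch

// Once every adapter has registered, a single constraints.find_valid_overlap()
// call would reconcile the collected requirements.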
*/ -template -class bilinear_resize_layer : public Layer { +template +class bilinear_resize_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "bilinear_resize_layer only supports DATA_PARALLEL"); public: bilinear_resize_layer(lbann_comm *comm, El::Int height, El::Int width) - : Layer(comm), m_height(height), m_width(width) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "bilinear_resize_layer only supports DATA_PARALLEL"); + : data_type_layer(comm), m_height(height), m_width(width) { } bilinear_resize_layer* copy() const override { @@ -57,17 +57,17 @@ class bilinear_resize_layer : public Layer { protected: - void setup_dims() override { - Layer::setup_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); // Get input dimensions - auto dims = get_input_dims(); + auto dims = this->get_input_dims(); const auto& num_dims = dims.size(); // Check that dimensions are valid std::stringstream err; if (num_dims < 2) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects input with at least two dimensions, " << "but input dimensions are "; for (size_t i = 0; i < num_dims; ++i) { @@ -75,12 +75,12 @@ class bilinear_resize_layer : public Layer { } LBANN_ERROR(err.str()); } else if (m_height <= 0) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to resize with " << "negative height (" << m_height << ")"; LBANN_ERROR(err.str()); } else if (m_width <= 0) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to resize with " << "negative width (" << m_width << ")"; LBANN_ERROR(err.str()); @@ -89,7 +89,7 @@ class bilinear_resize_layer : public Layer { // Resize output tensor dims[num_dims-2] = m_height; dims[num_dims-1] = m_width; - set_output_dims(dims); + this->set_output_dims(dims); } @@ -106,6 +106,14 @@ class bilinear_resize_layer : public Layer { }; +#ifndef LBANN_BILINEAR_RESIZE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class bilinear_resize_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_BILINEAR_RESIZE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index 3dfa79edb79..4c911ec255e 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -29,170 +29,129 @@ #include "lbann/layers/io/io_layer.hpp" //#include "lbann/utils/dataset.hpp" +#include "lbann/io/persist.hpp" #include "lbann/io/data_buffers/generic_io_buffer.hpp" #include "lbann/io/data_buffers/partitioned_io_buffer.hpp" #include "lbann/models/model.hpp" -#include "lbann/callbacks/callback_imcomm.hpp" +#include "lbann/callbacks/imcomm.hpp" #include "lbann/utils/omp_diagnostics.hpp" +#include +#include +#include +#include +#include #include namespace lbann { /** @todo Move functionality to input_layer. 
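One mechanical change that recurs throughout the bilinear_resize and input-layer hunks is that member calls such as get_name(), get_input_dims(), and set_output_dims() gain a this-> prefix once the base class becomes a class template (data_type_layer<TensorDataType> or the templated io_layer). A small self-contained example of why that qualification is needed; the names are illustrative only.

// Unqualified names from a dependent base are not found during the first
// phase of two-phase lookup, so they must be accessed through this->.
template <typename T>
struct base_sketch {
  int size() const { return 42; }
};

template <typename T>
struct derived_sketch : base_sketch<T> {
  int doubled() const {
    // return size() * 2;     // error: 'size' was not declared in this scope
    return this->size() * 2;  // OK: lookup is deferred until instantiation
  }
};

int main() { derived_sketch<float> d; return d.doubled() == 84 ? 0 : 1; }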
*/ -class generic_input_layer : public io_layer { +template +class generic_input_layer : public io_layer { public: - using data_reader_map_t = std::map; using io_buffer_map_t = std::map>; public: generic_input_layer(lbann_comm *comm, int num_parallel_readers, - std::map data_readers, - bool data_set_spans_models = true, data_reader_target_mode dr_mode = data_reader_target_mode::CLASSIFICATION) - : io_layer(comm, data_set_spans_models, dr_mode), - m_io_buffers(), - m_training_dataset(), - m_testing_dataset(), - m_validation_dataset(), - m_data_readers(data_readers), - m_data_set_processed(false) { + : io_layer(comm, dr_mode), + m_io_buffers() { //m_data_sets_span_models(data_sets_span_models) { // Input layers have no parents - m_expected_num_parent_layers = 0; + this->m_expected_num_parent_layers = 0; if(dr_mode == data_reader_target_mode::NA) { - m_expected_num_child_layers = 1; + this->m_expected_num_child_layers = 1; }else { // Input layers output a sample and target, which could be the // original value, categorical label, or regression value - m_expected_num_child_layers = 2; + this->m_expected_num_child_layers = 2; } - if(m_data_readers[execution_mode::training] != nullptr) { - m_training_dataset.total_samples() = m_data_readers[execution_mode::training]->get_num_data(); - } + this->m_active_buffer[execution_mode::training].store(-1); + this->m_active_buffer[execution_mode::validation].store(-1); + this->m_active_buffer[execution_mode::testing].store(-1); + } - if(m_data_readers[execution_mode::validation] != nullptr) { - m_validation_dataset.total_samples() = m_data_readers[execution_mode::validation]->get_num_data(); - } + ~generic_input_layer() override { - if(m_data_readers[execution_mode::testing] != nullptr) { - m_testing_dataset.total_samples() = m_data_readers[execution_mode::testing]->get_num_data(); + // Synchronize the I/O thread pool + // Note: The thread pool may still be running asynchronously if the + // trainer is destroyed in the middle of an epoch. The thread pool + // needs to interact with data readers, etc., so it needs to be + // synchronized before any of them are destroyed. + if (this->m_model != nullptr) { + if (this->m_model->has_valid_execution_context()) { + this->m_model->get_execution_context().get_io_thread_pool().reap_threads(); + } } - m_active_buffer[execution_mode::training].store(-1); - m_active_buffer[execution_mode::validation].store(-1); - m_active_buffer[execution_mode::testing].store(-1); - } - - ~generic_input_layer() override { for (auto& io_buffer : m_io_buffers) { delete io_buffer; } - // Input layer always frees data readers. - for (auto& dr : m_data_readers) { - delete dr.second; - } } // Input layers copy their datareaders. 
generic_input_layer(const generic_input_layer& other) - : io_layer(other), - m_io_buffers(other.m_io_buffers), - m_training_dataset(other.m_training_dataset), - m_testing_dataset(other.m_testing_dataset), - m_validation_dataset(other.m_validation_dataset), - m_data_readers(other.m_data_readers) { + : io_layer(other), + m_io_buffers(other.m_io_buffers) { for (auto& io_buffer : m_io_buffers) { io_buffer = io_buffer->copy(); } - for (auto& dr : m_data_readers) { - dr.second = dr.second->copy(); - } } generic_input_layer& operator=(const generic_input_layer& other) { - io_layer::operator=(other); + io_layer::operator=(other); for (auto& io_buffer : m_io_buffers) { io_buffer = io_buffer->copy(); } - for (auto& dr : m_data_readers) { - dr.second = dr.second->copy(); - } return *this; } + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + // ar(CEREAL_NVP(m_io_buffer)); + } + template - inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers); + inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers) { + m_io_buffers.push_back(new T_io_buffer(comm, num_parallel_readers, this->m_expected_num_child_layers)); + } std::string get_type() const override { return "generic_input"; } description get_description() const override { - auto&& desc = io_layer::get_description(); + auto desc = io_layer::get_description(); desc.add("Buffer", m_io_buffers[0]->get_type()); - desc.add("Background I/O", this->m_model->background_io_activity_allowed()); return desc; } - void setup_dims() override { - io_layer::setup_dims(); - for (int i = 0; i < get_num_children(); ++i) { - set_output_dims(get_data_dims(i), i); + void setup_dims(DataReaderMetaData& dr_metadata) override { + io_layer::setup_dims(dr_metadata); + for (int i = 0; i < this->get_num_children(); ++i) { + this->set_output_dims(get_data_dims(dr_metadata, i), i); } } - void setup_data() override { - io_layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + io_layer::setup_data(max_mini_batch_size); // Resize output to maximum mini-batch size - const auto& max_mb_size = this->m_model->get_max_mini_batch_size(); - for (int i = 0; i < get_num_children(); ++i) { - auto& output = get_activations(i); - output.Resize(output.Height(), max_mb_size); - } - - auto num_io_threads = this->m_model->get_io_thread_pool()->get_num_threads(); - /// BVE FIXME foreach data reader - // in case that target_layer gets initialized beforehand - if(m_data_readers[execution_mode::training] != nullptr) { - m_data_readers[execution_mode::training]->setup(num_io_threads, this->m_model->get_io_thread_pool()); - m_data_readers[execution_mode::training]->set_rank(Layer::m_comm->get_rank_in_trainer()); - } - if(m_data_readers[execution_mode::validation] != nullptr) { - m_data_readers[execution_mode::validation]->setup(num_io_threads, this->m_model->get_io_thread_pool()); - m_data_readers[execution_mode::validation]->set_rank(Layer::m_comm->get_rank_in_trainer()); - } - if(m_data_readers[execution_mode::testing] != nullptr) { - m_data_readers[execution_mode::testing]->setup(num_io_threads, this->m_model->get_io_thread_pool()); - m_data_readers[execution_mode::testing]->set_rank(Layer::m_comm->get_rank_in_trainer()); - } - - if(io_layer::m_data_set_spans_models) { - calculate_num_iterations_per_epoch_training_spans_models(max_mb_size); - } else { - calculate_num_iterations_per_epoch_training_unique_per_models(max_mb_size); + for (int i = 0; i < this->get_num_children(); 
++i) { + auto& output = this->get_activations(i); + output.Resize(output.Height(), max_mini_batch_size); } for (auto& io_buffer : m_io_buffers) { int linearized_target_size; - switch(m_data_reader_mode) { - case data_reader_target_mode::REGRESSION: - linearized_target_size = get_linearized_response_size(); - break; - case data_reader_target_mode::RECONSTRUCTION: - linearized_target_size = get_linearized_data_size(); - break; - case data_reader_target_mode::CLASSIFICATION: - linearized_target_size = get_linearized_label_size(); - break; - case data_reader_target_mode::NA: - default: + if(this->get_num_children() > 1) { + linearized_target_size = this->get_output_size(1); + }else { linearized_target_size = 0; } - io_buffer->setup_data(get_output_size(0), + io_buffer->setup_data(this->get_output_size(0), linearized_target_size, - max_mb_size); + max_mini_batch_size); } } @@ -200,29 +159,34 @@ class generic_input_layer : public io_layer { * Sets up the effective (global) mini-batch size. */ void fp_setup_outputs(El::Int mini_batch_size) override { - - // Determine model mini-batch size and effective mini-batch size - // Note: If inter-model communication is activated, the effective - // mini-batch is equal to the global mini-batch size. - /// @todo This functionality should probably be moved elsewhere - mini_batch_size = get_current_mini_batch_size(); - int effective_mini_batch_size = mini_batch_size; - for (auto&& cb : this->m_model->get_callbacks()) { - if (dynamic_cast(cb) != nullptr) { - effective_mini_batch_size = get_current_global_mini_batch_size(); - break; + /// During model setup there is no valid execution context, but + /// during execution there is a context + if(this->m_model->has_valid_execution_context()) { + // Determine model mini-batch size and effective mini-batch size + // Note: If inter-model communication is activated, the effective + // mini-batch is equal to the global mini-batch size. 
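Further down in this file, fp_compute() and fetch_data_in_background() rotate through two generic_io_buffer instances (see the m_active_buffer counters initialized to -1 in the constructor above), so the next mini-batch is fetched on an I/O thread while the current one is consumed. A minimal, self-contained sketch of that double-buffering idea follows; it is not the patch's implementation, just the shape of the pattern.

#include <array>
#include <cstddef>
#include <future>
#include <vector>

struct buffer_sketch { std::vector<float> samples; };

class double_buffered_fetch_sketch {
public:
  // Return the buffer for this step and start filling the other buffer in the
  // background so the next call can usually proceed without waiting.
  buffer_sketch& fetch(std::size_t step) {
    const std::size_t cur = step % 2;
    if (m_pending[cur].valid()) {
      m_pending[cur].get();   // wait for the fill queued on the previous step
    } else {
      fill(cur, step);        // first use of this buffer: fill synchronously
    }
    const std::size_t nxt = (step + 1) % 2;
    m_pending[nxt] = std::async(std::launch::async,
                                [this, nxt, step] { fill(nxt, step + 1); });
    return m_buffers[cur];
  }
private:
  void fill(std::size_t idx, std::size_t step) {
    // Stand-in for a data-reader fetch into the idx-th buffer.
    m_buffers[idx].samples.assign(16, static_cast<float>(step));
  }
  std::array<buffer_sketch, 2> m_buffers{};
  std::array<std::future<void>, 2> m_pending{};
};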
+ /// @todo This functionality should probably be moved elsewhere + mini_batch_size = get_current_mini_batch_size(); + + auto effective_mini_batch_size = mini_batch_size; + for (auto&& cb : this->m_model->get_callbacks()) { + if (dynamic_cast(cb) != nullptr) { + effective_mini_batch_size = get_current_global_mini_batch_size(); + break; + } } - } - // Set mini-batch size in model - this->m_model->set_current_mini_batch_size(mini_batch_size); - this->m_model->set_effective_mini_batch_size(effective_mini_batch_size); + auto& c = static_cast(this->m_model->get_execution_context()); + // Set mini-batch size in model + c.set_current_mini_batch_size(mini_batch_size); + c.set_effective_mini_batch_size(effective_mini_batch_size); + } // Initialize matrices - io_layer::fp_setup_outputs(mini_batch_size); + io_layer::fp_setup_outputs(mini_batch_size); for (auto& io_buffer : m_io_buffers) { - for (int i = 0; i < get_num_children(); ++i) { + for (int i = 0; i < this->get_num_children(); ++i) { io_buffer->fp_setup_data(mini_batch_size, i); } } @@ -230,8 +194,9 @@ class generic_input_layer : public io_layer { void fetch_data_in_background(int future_active_buffer, execution_mode mode) { int active_buffer = future_active_buffer % m_io_buffers.size(); - generic_io_buffer* io_buffer = m_io_buffers[active_buffer]; - std::lock_guard guard(dr_mutex); + generic_io_buffer* io_buffer = m_io_buffers[active_buffer]; + data_coordinator& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + std::lock_guard guard(dc.dr_mutex); setup_next_io_buffer(io_buffer); io_buffer->fetch_to_local_matrix(get_data_reader(mode), mode); return; @@ -248,16 +213,16 @@ class generic_input_layer : public io_layer { } void fp_compute() override { - execution_mode mode = this->m_model->get_execution_mode(); + execution_mode mode = this->m_model->get_execution_context().get_execution_mode(); increment_active_buffer_idx(mode); - generic_io_buffer* io_buffer = m_io_buffers[get_active_buffer_idx(mode) % m_io_buffers.size()]; + generic_io_buffer* io_buffer = m_io_buffers[get_active_buffer_idx(mode) % m_io_buffers.size()]; // If there is no valid data and there is not already a background // thread to fetch the data, queue up the background thread if(io_buffer->num_samples_ready(mode) == 0 && !io_buffer->is_data_fetched_in_background(mode)) { - std::future background_fetch_done = this->m_model->get_io_thread_pool()->submit_job( + std::future background_fetch_done = this->m_model->get_execution_context().get_io_thread_pool().submit_job( std::bind(&generic_input_layer::fetch_data_in_background, this, get_active_buffer_idx(mode), mode)); io_buffer->set_data_fetch_future(std::move(background_fetch_done), mode); io_buffer->set_fetch_data_in_background(true, mode); @@ -280,36 +245,37 @@ class generic_input_layer : public io_layer { } } - if(dynamic_cast(io_buffer) != nullptr) { + if(dynamic_cast*>(io_buffer) != nullptr) { // Use the predetermined size of the mini-batch to set the current // batch size for the neural network num_samples_in_batch = get_current_mini_batch_size(); update_num_samples_processed(num_samples_in_batch); - if(m_expected_num_child_layers == 1) { - io_buffer->distribute_from_local_matrix(get_data_reader(), mode, get_activations(0)); + if(this->m_expected_num_child_layers == 1) { + io_buffer->distribute_from_local_matrix(get_data_reader(), mode, this->get_activations(0)); }else { - io_buffer->distribute_from_local_matrix(get_data_reader(), mode, get_activations(0), get_activations(1)); + 
io_buffer->distribute_from_local_matrix(get_data_reader(), mode, this->get_activations(0), this->get_activations(1)); } }else { - LBANN_ERROR("could not fp_compute for I/O layers : encoutered generic_io_buffer type"); + LBANN_ERROR("could not fp_compute for I/O layers : encoutered generic_io_buffer type"); } - m_data_set_processed = io_buffer->update_data_set(get_data_reader(mode), mode); + data_coordinator& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + dc.m_data_set_processed = io_buffer->update_data_set(get_data_reader(mode), mode); - if(!m_data_set_processed && this->m_model->background_io_activity_allowed()) { + if(!dc.m_data_set_processed && this->m_model->get_execution_context().background_io_activity_allowed()) { int next_active_buffer = get_active_buffer_idx(mode) + 1; - std::future background_fetch_done = this->m_model->get_io_thread_pool()->submit_job( + std::future background_fetch_done = this->m_model->get_execution_context().get_io_thread_pool().submit_job( std::bind(&generic_input_layer::fetch_data_in_background, this, next_active_buffer, mode)); - generic_io_buffer* next_io_buffer = m_io_buffers[next_active_buffer % m_io_buffers.size()]; + generic_io_buffer* next_io_buffer = m_io_buffers[next_active_buffer % m_io_buffers.size()]; next_io_buffer->set_data_fetch_future(std::move(background_fetch_done), mode); next_io_buffer->set_fetch_data_in_background(true, mode); } } - void setup_next_io_buffer(generic_io_buffer* io_buffer) { + void setup_next_io_buffer(generic_io_buffer* io_buffer) { int mini_batch_size = get_current_mini_batch_size(); - for (int i = 0; i < get_num_children(); ++i) { + for (int i = 0; i < this->get_num_children(); ++i) { io_buffer->fp_setup_data(mini_batch_size, i); } } @@ -318,7 +284,8 @@ class generic_input_layer : public io_layer { * Once a mini-batch is processed, resuffle the data for the next batch if necessary */ bool update_compute() override { - return m_data_set_processed; + data_coordinator& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + return dc.m_data_set_processed; } //************************************************************************ @@ -326,26 +293,11 @@ class generic_input_layer : public io_layer { //************************************************************************ generic_data_reader *get_data_reader(const execution_mode mode) const { - generic_data_reader *data_reader = nullptr; - - auto it = m_data_readers.find(mode); - if (it != m_data_readers.end()) data_reader = it->second; - - switch(mode) { - case execution_mode::training: - break; - case execution_mode::validation: - break; - case execution_mode::testing: - break; - default: - LBANN_ERROR("generic data distribution: invalid execution phase"); - } - return data_reader; + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_data_reader(mode); } generic_data_reader *get_data_reader() const { - return get_data_reader(this->m_model->get_execution_mode()); + return get_data_reader(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_num_parallel_readers(execution_mode mode) const { @@ -354,7 +306,7 @@ class generic_input_layer : public io_layer { } virtual int get_num_parallel_readers() const { - return get_num_parallel_readers(this->m_model->get_execution_mode()); + return get_num_parallel_readers(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_num_iterations_per_epoch(execution_mode mode) const { @@ -363,7 
+315,7 @@ class generic_input_layer : public io_layer { } virtual int get_num_iterations_per_epoch() const { - return get_num_iterations_per_epoch(this->m_model->get_execution_mode()); + return get_num_iterations_per_epoch(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_current_step_in_epoch(execution_mode mode) const { @@ -372,7 +324,7 @@ class generic_input_layer : public io_layer { } virtual int get_current_step_in_epoch() const { - return get_current_step_in_epoch(this->m_model->get_execution_mode()); + return get_current_step_in_epoch(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_mini_batch_size(execution_mode mode) const { @@ -386,7 +338,7 @@ class generic_input_layer : public io_layer { } virtual int get_last_mini_batch_size() const { - return get_last_mini_batch_size(this->m_model->get_execution_mode()); + return get_last_mini_batch_size(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_current_mini_batch_size(execution_mode mode) const { @@ -395,7 +347,7 @@ class generic_input_layer : public io_layer { } virtual int get_current_mini_batch_size() const { - return get_current_mini_batch_size(this->m_model->get_execution_mode()); + return get_current_mini_batch_size(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_global_mini_batch_size(execution_mode mode) const { @@ -414,7 +366,7 @@ class generic_input_layer : public io_layer { } virtual int get_current_global_mini_batch_size() const { - return get_current_global_mini_batch_size(this->m_model->get_execution_mode()); + return get_current_global_mini_batch_size(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_world_master_mini_batch_adjustment(execution_mode mode) const { @@ -423,7 +375,7 @@ class generic_input_layer : public io_layer { } virtual int get_world_master_mini_batch_adjustment() const { - return get_world_master_mini_batch_adjustment(this->m_model->get_execution_mode()); + return get_world_master_mini_batch_adjustment(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_current_world_master_mini_batch_adjustment(execution_mode mode, int model_rank) const { @@ -432,110 +384,32 @@ class generic_input_layer : public io_layer { } virtual int get_current_world_master_mini_batch_adjustment(int model_rank) const { - return get_current_world_master_mini_batch_adjustment(this->m_model->get_execution_mode(), model_rank); - } - - /** Calculate how many iterations are required for training, testing, - * and validation given a specified mini-batch size and that the - * training data set is spanning all of the models. 
- */ - void calculate_num_iterations_per_epoch_training_spans_models(int mini_batch_size) { - - generic_data_reader *dr = get_data_reader(execution_mode::training); - if(dr != nullptr) { - /// Setup the training data set so that it spans all models - m_io_buffers[0]->calculate_num_iterations_per_epoch_spanning_models(mini_batch_size, dr); - } - - dr = get_data_reader(execution_mode::validation); - if(dr != nullptr) { - /// Each model uses the entire validation and testing data sets - m_io_buffers[0]->calculate_num_iterations_per_epoch_single_model(mini_batch_size, dr); - } - - dr = get_data_reader(execution_mode::testing); - if(dr != nullptr) { - m_io_buffers[0]->calculate_num_iterations_per_epoch_single_model(mini_batch_size, dr); - } - - } - - void calculate_num_iterations_per_epoch_training_unique_per_models(int mini_batch_size) { - - generic_data_reader *dr = get_data_reader(execution_mode::training); - if(dr != nullptr) { - /// Setup the training data set so that it spans all models - m_io_buffers[0]->calculate_num_iterations_per_epoch_single_model(mini_batch_size, dr); - } - - dr = get_data_reader(execution_mode::validation); - if(dr != nullptr) { - /// Each model uses the entire validation and testing data sets - m_io_buffers[0]->calculate_num_iterations_per_epoch_single_model(mini_batch_size, dr); - } - - dr = get_data_reader(execution_mode::testing); - if(dr != nullptr) { - m_io_buffers[0]->calculate_num_iterations_per_epoch_single_model(mini_batch_size, dr); - } - + return get_current_world_master_mini_batch_adjustment(this->m_model->get_execution_context().get_execution_mode(), model_rank); } //************************************************************************ // Helper functions to access the dataset statistics //************************************************************************ dataset& get_dataset(execution_mode m) override { - switch(m) { - case execution_mode::training: - return m_training_dataset; - break; - case execution_mode::validation: - return m_validation_dataset; - break; - case execution_mode::testing: - return m_testing_dataset; - break; - default: - LBANN_ERROR("get_dataset: invalid execution mode"); - } + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_dataset(m); } const dataset& get_dataset(execution_mode m) const override { - switch(m) { - case execution_mode::training: - return m_training_dataset; - break; - case execution_mode::validation: - return m_validation_dataset; - break; - case execution_mode::testing: - return m_testing_dataset; - break; - default: - LBANN_ERROR("get_dataset: invalid execution mode"); - } + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_dataset(m); } /** * Return the dataset associated with the current execution mode. */ - dataset& select_dataset() override { return get_dataset(m_model->get_execution_mode()); } - const dataset& select_dataset() const override { return get_dataset(m_model->get_execution_mode()); } + dataset& select_dataset() override { return get_dataset(this->m_model->get_execution_context().get_execution_mode()); } + const dataset& select_dataset() const override { return get_dataset(this->m_model->get_execution_context().get_execution_mode()); } /** * Return the first dataset with a valid (non-null) datareader. * Returns null if none are valid. 
*/ dataset* select_first_valid_dataset() override { - if (m_data_readers[execution_mode::training]) { - return &m_training_dataset; - } else if (m_data_readers[execution_mode::validation]) { - return &m_validation_dataset; - } else if (m_data_readers[execution_mode::testing]) { - return &m_testing_dataset; - } else { - return nullptr; - } + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().select_first_valid_dataset(); } /** @@ -558,34 +432,21 @@ class generic_input_layer : public io_layer { * Return the sample indices fetched in the current mini-batch. */ El::Matrix* get_sample_indices_per_mb() override { - execution_mode mode = this->m_model->get_execution_mode(); - generic_io_buffer* io_buffer = m_io_buffers[get_active_buffer_idx(mode) % m_io_buffers.size()]; - return io_buffer->get_sample_indices_fetched_per_mb(this->m_model->get_execution_mode()); + execution_mode mode = this->m_model->get_execution_context().get_execution_mode(); + generic_io_buffer* io_buffer = m_io_buffers[get_active_buffer_idx(mode) % m_io_buffers.size()]; + return io_buffer->get_sample_indices_fetched_per_mb(this->m_model->get_execution_context().get_execution_mode()); } /** * Get the dimensions of the underlying data. */ - const std::vector get_data_dims(int child_index = 0) const override { - const generic_data_reader *dr = get_data_reader(); - // dataset* ds = select_first_valid_dataset(); - if (dr) { - if(child_index == 0) { - return dr->get_data_dims(); - }else if(child_index == 1) { - switch(m_data_reader_mode) { - case data_reader_target_mode::REGRESSION: - return std::vector(1, dr->get_num_responses()); - case data_reader_target_mode::RECONSTRUCTION: - return dr->get_data_dims(); - case data_reader_target_mode::CLASSIFICATION: - default: - return std::vector(1, dr->get_num_labels()); - } - // the correct value based on initialization - }else { - LBANN_ERROR("get_data_dims: Invalid child index"); - } + std::vector get_data_dims(DataReaderMetaData& dr_metadata, int child_index = 0) const override { + if(child_index == 0) { + return dr_metadata.data_dims[data_reader_target_mode::INPUT]; + }else if(child_index == 1) { + return dr_metadata.data_dims[this->m_data_reader_mode]; + }else { + LBANN_ERROR("get_data_dims: Invalid child index"); } return std::vector(1, 0); } @@ -596,26 +457,26 @@ class generic_input_layer : public io_layer { long get_linearized_data_size() const override { long linearized_data_size = -1; - data_reader_map_t::const_iterator it; + generic_data_reader *dr; - it = m_data_readers.find(execution_mode::training); - if ((it != m_data_readers.end()) && it->second) { - linearized_data_size = (it->second)->get_linearized_data_size(); - std::cerr << "XX >>>>>> linearized_data_size: " << linearized_data_size << "\n"; + auto& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + dr = dc.get_data_reader(execution_mode::training); + if (dr != nullptr) { + linearized_data_size = dr->get_linearized_data_size(); } - it = m_data_readers.find(execution_mode::validation); - if ((it != m_data_readers.end()) && it->second) { - long tmp_data_size = (it->second)->get_linearized_data_size(); + dr = dc.get_data_reader(execution_mode::validation); + if (dr != nullptr) { + long tmp_data_size = dr->get_linearized_data_size(); if (linearized_data_size != -1 && linearized_data_size != tmp_data_size) { LBANN_ERROR("lbann_io_layer: validation data set size does not " "match the currently established data set size"); } } - it = 
m_data_readers.find(execution_mode::testing); - if ((it != m_data_readers.end()) && it->second) { - long tmp_data_size = (it->second)->get_linearized_data_size(); + dr = dc.get_data_reader(execution_mode::testing); + if (dr != nullptr) { + long tmp_data_size = dr->get_linearized_data_size(); if (linearized_data_size != -1 && linearized_data_size != tmp_data_size) { LBANN_ERROR("lbann_io_layer: testing data set size does not " "match the currently established data set size"); @@ -628,26 +489,27 @@ class generic_input_layer : public io_layer { * Get the linearized size of the labels for the underlying data. */ long get_linearized_label_size() const override { - if (is_for_regression()) { + if (this->is_for_regression()) { return static_cast(1); } long linearized_label_size = -1; - data_reader_map_t::const_iterator it; + generic_data_reader *dr; - it = m_data_readers.find(execution_mode::training); - if ((it != m_data_readers.end()) && it->second) { - linearized_label_size = (it->second)->get_linearized_label_size(); + auto& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + dr = dc.get_data_reader(execution_mode::training); + if (dr != nullptr) { + linearized_label_size = dr->get_linearized_label_size(); } - it = m_data_readers.find(execution_mode::validation); - if ((it != m_data_readers.end()) && it->second) { - long tmp_label_size = (it->second)->get_linearized_label_size(); + dr = dc.get_data_reader(execution_mode::validation); + if (dr != nullptr) { + long tmp_label_size = dr->get_linearized_label_size(); if (linearized_label_size != -1 && linearized_label_size != tmp_label_size) { LBANN_ERROR("lbann_io_layer: validation label set size (" + std::to_string(tmp_label_size) + ") does not match the currently established data set size (" + std::to_string(linearized_label_size) + ")"); } } - it = m_data_readers.find(execution_mode::testing); - if ((it != m_data_readers.end()) && it->second) { - long tmp_label_size = (it->second)->get_linearized_label_size(); + dr = dc.get_data_reader(execution_mode::testing); + if (dr != nullptr) { + long tmp_label_size = dr->get_linearized_label_size(); if (linearized_label_size != -1 && linearized_label_size != tmp_label_size) { LBANN_ERROR("lbann_io_layer: testing label set size does not " "match the currently established data set size"); @@ -657,27 +519,28 @@ class generic_input_layer : public io_layer { } long get_linearized_response_size() const override { - if (!is_for_regression()) { + if (!this->is_for_regression()) { return static_cast(1); } long linearized_response_size = -1; - data_reader_map_t::const_iterator it; + generic_data_reader *dr; - it = m_data_readers.find(execution_mode::training); - if ((it != m_data_readers.end()) && it->second) { - linearized_response_size = (it->second)->get_linearized_response_size(); + auto& dc = this->m_model->get_execution_context().get_trainer().get_data_coordinator(); + dr = dc.get_data_reader(execution_mode::training); + if (dr != nullptr) { + linearized_response_size = dr->get_linearized_response_size(); } - it = m_data_readers.find(execution_mode::validation); - if ((it != m_data_readers.end()) && it->second) { - long tmp_response_size = (it->second)->get_linearized_response_size(); + dr = dc.get_data_reader(execution_mode::validation); + if (dr != nullptr) { + long tmp_response_size = dr->get_linearized_response_size(); if (linearized_response_size != -1 && linearized_response_size != tmp_response_size) { LBANN_ERROR("lbann_io_layer: validation response set size does not " 
"match the currently established data set size"); } } - it = m_data_readers.find(execution_mode::testing); - if ((it != m_data_readers.end()) && it->second) { - long tmp_response_size = (it->second)->get_linearized_response_size(); + dr = dc.get_data_reader(execution_mode::testing); + if (dr != nullptr) { + long tmp_response_size = dr->get_linearized_response_size(); if (linearized_response_size != -1 && linearized_response_size != tmp_response_size) { LBANN_ERROR("lbann_io_layer: testing response set size does not " "match the currently established data set size"); @@ -687,21 +550,21 @@ class generic_input_layer : public io_layer { } long get_num_samples_trained() const override { - return m_training_dataset.get_num_samples_processed(); + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_num_samples_trained(); } long get_num_samples_tested() const override { - return m_testing_dataset.get_num_samples_processed(); + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_num_samples_tested(); } long get_total_num_training_samples() const override { - return m_training_dataset.get_total_samples(); + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_total_num_training_samples(); } long get_total_num_testing_samples() const override { - return m_testing_dataset.get_total_samples(); + return this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_total_num_testing_samples(); } bool at_new_epoch() const override { - const data_reader_map_t::const_iterator it = m_data_readers.find(execution_mode::training); - return ((it != m_data_readers.end()) && it->second && (it->second)->at_new_epoch()); + const generic_data_reader *dr = this->m_model->get_execution_context().get_trainer().get_data_coordinator().get_data_reader(execution_mode::training); + return (dr != nullptr && dr->at_new_epoch()); } bool is_execution_mode_valid(execution_mode mode) const override { @@ -715,174 +578,59 @@ class generic_input_layer : public io_layer { // save state of IO to a checkpoint bool save_to_checkpoint_shared(persist& p) const override { // save state of data readers from input layer - data_reader_map_t::const_iterator it; - if(p.get_cb_type() != callback_type::validation){ - it = this->m_data_readers.find(execution_mode::training); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_shared(p, "data_reader_training"); - } - it = this->m_data_readers.find(execution_mode::testing); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_shared(p, "data_reader_testing"); - } - if (m_comm->am_trainer_master()) { - p.write_uint64(persist_type::train, "reader_train_processed", - (uint64_t) m_training_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_train_total", - (uint64_t) m_training_dataset.get_total_samples()); + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint){ - p.write_uint64(persist_type::train, "reader_test_processed", - (uint64_t) m_testing_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_test_total", - (uint64_t) m_testing_dataset.get_total_samples()); + this->m_model->get_execution_context().get_trainer().get_data_coordinator().save_to_checkpoint_shared(p); + if (this->get_comm()->am_trainer_master()) { + write_cereal_archive(*this, p, execution_mode::training, 
"_io.xml"); } - } - if(p.get_cb_type() == callback_type::validation || p.get_cb_type() == callback_type::batch){ - if (m_comm->am_trainer_master()) { - p.write_uint64(persist_type::validate, "reader_validate_processed", - (uint64_t) m_validation_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::validate, "reader_validate_total", - (uint64_t) m_validation_dataset.get_total_samples()); - } - it = this->m_data_readers.find(execution_mode::validation); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_shared(p, "data_reader_validation"); - } + } return true; } - struct dataset_header { - uint64_t train_proc; - uint64_t train_total; - uint64_t test_proc; - uint64_t test_total; - uint64_t validate_proc; - uint64_t validate_total; - }; - // reload state of IO from a checkpoint bool load_from_checkpoint_shared(persist& p) override { - // save state of data readers from input layer - data_reader_map_t::const_iterator it; - - it = this->m_data_readers.find(execution_mode::training); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_shared(p, "data_reader_training"); - } - it = this->m_data_readers.find(execution_mode::testing); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_shared(p, "data_reader_testing"); - } - - // save our own state - // rank 0 reads the file - dataset_header header; - // Assume we are loading from a epoch end checkpoint - if (m_comm->am_trainer_master()) { - p.read_uint64(persist_type::train, "reader_train_processed", &header.train_proc); - p.read_uint64(persist_type::train, "reader_train_total", &header.train_total); - p.read_uint64(persist_type::train, "reader_test_processed", &header.test_proc); - p.read_uint64(persist_type::train, "reader_test_total", &header.test_total); - if(m_data_readers[execution_mode::validation] != nullptr){ - p.read_uint64(persist_type::validate, "reader_validate_processed", &header.validate_proc); - p.read_uint64(persist_type::validate, "reader_validate_total", &header.validate_total); + // save state of the input layer + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint){ + + std::string buf; + if (this->get_comm()->am_trainer_master()) { + read_cereal_archive(*this, p, execution_mode::training, "_io.xml"); + buf = create_cereal_archive_binary_string(*this); + } + + // TODO: this assumes homogeneous processors + // broadcast state from rank 0 + this->get_comm()->trainer_broadcast(0, buf); + + if (!this->get_comm()->am_trainer_master()) { + unpack_cereal_archive_binary_string(*this, buf); } - } - it = this->m_data_readers.find(execution_mode::validation); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_shared(p, "data_reader_validation"); - } - // TODO: assumes homogeneous hardware - // broadcast data from rank 0 - MPI_Bcast(&header, sizeof(header), MPI_BYTE, 0, MPI_COMM_WORLD); - // set our fields - m_training_dataset.num_samples_processed() = (long) header.train_proc; - m_training_dataset.total_samples() = (long) header.train_total; - m_testing_dataset.num_samples_processed() = (long) header.test_proc; - m_testing_dataset.total_samples() = (long) header.test_total; - if(m_data_readers[execution_mode::validation] != nullptr){ - m_validation_dataset.num_samples_processed() = (long) header.validate_proc; - m_validation_dataset.total_samples() = (long) header.validate_total; } return 
true; } bool save_to_checkpoint_distributed(persist& p) const override { // save state of data readers from input layer - data_reader_map_t::const_iterator it; - if(p.get_cb_type() != callback_type::validation){ - it = this->m_data_readers.find(execution_mode::training); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_distributed(p, "data_reader_training"); - } - it = this->m_data_readers.find(execution_mode::testing); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_distributed(p, "data_reader_testing"); - } - p.write_uint64(persist_type::train, "reader_train_processed", - (uint64_t) m_training_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_train_total", - (uint64_t) m_training_dataset.get_total_samples()); - - p.write_uint64(persist_type::train, "reader_test_processed", - (uint64_t) m_testing_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_test_total", - (uint64_t) m_testing_dataset.get_total_samples()); - - } - if(p.get_cb_type() == callback_type::validation || p.get_cb_type() == callback_type::batch){ - p.write_uint64(persist_type::validate, "reader_validate_processed", - (uint64_t) m_validation_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::validate, "reader_validate_total", - (uint64_t) m_validation_dataset.get_total_samples()); - it = this->m_data_readers.find(execution_mode::validation); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_distributed(p, "data_reader_validation"); - } + if(p.get_cb_type() == callback_type::execution_context_only || p.get_cb_type() == callback_type::full_checkpoint) { + this->m_model->get_execution_context().get_trainer().get_data_coordinator().save_to_checkpoint_distributed(p); + write_cereal_archive(*this, p, execution_mode::training, "_io.xml"); } return true; } bool load_from_checkpoint_distributed(persist& p) override { - // save state of data readers from input layer - data_reader_map_t::const_iterator it; - it = this->m_data_readers.find(execution_mode::training); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_distributed(p, "data_reader_training"); - } - it = this->m_data_readers.find(execution_mode::testing); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_distributed(p, "data_reader_testing"); - } - // save our own state - // rank 0 reads the file - dataset_header header; - p.read_uint64(persist_type::train, "reader_train_processed", &header.train_proc); - p.read_uint64(persist_type::train, "reader_train_total", &header.train_total); - p.read_uint64(persist_type::train, "reader_test_processed", &header.test_proc); - p.read_uint64(persist_type::train, "reader_test_total", &header.test_total); - if(m_data_readers[execution_mode::validation] != nullptr){ - p.read_uint64(persist_type::validate, "reader_validate_processed", &header.validate_proc); - p.read_uint64(persist_type::validate, "reader_validate_total", &header.validate_total); - } - it = this->m_data_readers.find(execution_mode::validation); - if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_distributed(p, "data_reader_validation"); - } - - // set our fields - m_training_dataset.num_samples_processed() = (long) header.train_proc; - m_training_dataset.total_samples() = (long) header.train_total; - 
m_testing_dataset.num_samples_processed() = (long) header.test_proc; - m_testing_dataset.total_samples() = (long) header.test_total; - if(m_data_readers[execution_mode::validation] != nullptr){ - m_validation_dataset.num_samples_processed() = (long) header.validate_proc; - m_validation_dataset.total_samples() = (long) header.validate_total; - } + // load state of data readers for input layer + + this->m_model->get_execution_context().get_trainer().get_data_coordinator().load_from_checkpoint_distributed(p); + + read_cereal_archive(*this, p, execution_mode::training, "_io.xml"); return true; } @@ -894,24 +642,10 @@ class generic_input_layer : public io_layer { } protected: - std::vector m_io_buffers; + std::vector*> m_io_buffers; io_buffer_map_t m_active_buffer; - - dataset m_training_dataset; - dataset m_testing_dataset; - dataset m_validation_dataset; - // bool m_data_sets_span_models; - - data_reader_map_t m_data_readers; - // std::map m_dataset_stats; - bool m_data_set_processed; - std::mutex dr_mutex; }; -template inline void generic_input_layer::initialize_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers) { - m_io_buffers.push_back(new T(comm, num_parallel_readers, data_readers, m_expected_num_child_layers)); -} - } // namespace lbann #endif // LBANN_LAYERS_GENERIC_INPUT_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/io/input/input_layer.hpp b/include/lbann/layers/io/input/input_layer.hpp index e2c144684b3..35bdee54a92 100644 --- a/include/lbann/layers/io/input/input_layer.hpp +++ b/include/lbann/layers/io/input/input_layer.hpp @@ -29,6 +29,7 @@ #include "lbann/layers/io/input/generic_input_layer.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" #include "lbann/models/model.hpp" #include #include @@ -37,23 +38,85 @@ namespace lbann { -template +#ifdef LBANN_HAS_DISTCONV +template +class input_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + using TensorHost = dc::TensorHost; + using TensorHostShuffler = dc::TensorHostShuffler; + + input_distconv_adapter(Layer& layer); + virtual ~input_distconv_adapter() = default; + + TensorHostShuffler &get_shuffler(const TensorHost &src, const TensorHost &dst, + int mat_idx); + void setup_fp_tensors() override; + std::unique_ptr setup_activations_i(int index) const override; + dc::Shape get_activations_local_shape(int index) const override; + dc::Shape get_activations_shape(int index) const; + void setup_shuffler_buffers(const TensorHost &src, const TensorHost &dst); + + // No bp tensors needed for this layer. + void setup_prev_error_signals() override {} + void setup_original_prev_error_signals() override {} + void setup_error_signals() override {} + void setup_original_error_signals() override {} + void setup_bp_tensors() override {} + + bool child_copy_required(size_t output_index) const override; + bool child_shuffle_required(size_t output_index) const override; + + // Nothing to do here as everything is done in fp_compute_distconv. 
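The reworked checkpoint path above drops the hand-rolled dataset_header and instead has the trainer master read the cereal archive from disk, re-pack the layer state into a binary string, broadcast that string across the trainer, and let every other rank unpack it. A minimal sketch of the pack/unpack halves of that pattern, using plain cereal and hypothetical helper names (pack_to_binary_string and unpack_from_binary_string are illustrative stand-ins, not LBANN's create_cereal_archive_binary_string / unpack_cereal_archive_binary_string):

#include <cereal/archives/binary.hpp>
#include <sstream>
#include <string>

// Serialize any cereal-enabled object into a byte string suitable for
// shipping with an MPI-style broadcast.
template <typename T>
std::string pack_to_binary_string(const T& obj) {
  std::ostringstream os;
  {
    cereal::BinaryOutputArchive ar(os);  // archive flushes when it leaves scope
    ar(obj);
  }
  return os.str();
}

// Rebuild the object from the broadcast bytes on the receiving ranks.
template <typename T>
void unpack_from_binary_string(T& obj, const std::string& buf) {
  std::istringstream is(buf);
  cereal::BinaryInputArchive ar(is);
  ar(obj);
}

Rank 0 packs after reading the "_io.xml" archive, the string is broadcast within the trainer (trainer_broadcast in the diff), and the remaining ranks unpack it; as the TODO in the diff notes, this still assumes homogeneous processors.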
+ void fp_setup(El::Int mini_batch_size) override {} + void fp_compute(); + bool is_input_processed(size_t index) const; + + private: + std::vector m_is_input_processed; + std::vector> m_original_host_tensors; + std::vector> m_host_tensors; + + bool m_shuffle_required; + std::vector, 4>> m_shufflers; + std::unique_ptr m_shuffler_src_buf; + size_t m_shuffler_src_buf_size = 0; + std::unique_ptr m_shuffler_dst_buf; + size_t m_shuffler_dst_buf_size = 0; + + // TODO: Use pinned memory pool + TensorDataType *m_copy_pinned_buffer = nullptr; +}; +#endif // LBANN_HAS_DISTCONV /** @brief Interface with data reader. */ -class input_layer : public generic_input_layer { +template +class input_layer : public generic_input_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "input layer only supports DATA_PARALLEL data layout"); + public: + /** @name Public Types */ + ///@{ + + /** @brief The local tensor type expected for IO in this object. */ + using IODataType = DataType; + + ///@} public: /// @todo make the map and vector references - input_layer(lbann_comm *comm, int num_parallel_readers, std::map data_readers, bool data_set_spans_models = true, + input_layer(lbann_comm *comm, int num_parallel_readers, data_reader_target_mode target_mode = data_reader_target_mode::CLASSIFICATION) - : generic_input_layer(comm, num_parallel_readers, data_readers, data_set_spans_models, target_mode) { - validate_data_layout(); + : generic_input_layer(comm, num_parallel_readers, target_mode) { // Initialize two buffers - initialize_io_buffer(comm, std::min(num_parallel_readers, Layer::m_comm->get_procs_per_trainer()), data_readers); - initialize_io_buffer(comm, std::min(num_parallel_readers, Layer::m_comm->get_procs_per_trainer()), data_readers); - for (auto io_buffer : m_io_buffers) { - io_buffer->fetch_data_fn = new fetch_data_functor(target_mode); + initialize_io_buffer(comm, std::min(num_parallel_readers, data_type_layer::m_comm->get_procs_per_trainer())); + initialize_io_buffer(comm, std::min(num_parallel_readers, data_type_layer::m_comm->get_procs_per_trainer())); + for (auto io_buffer : this->m_io_buffers) { + io_buffer->fetch_data_fn = new fetch_data_functor(target_mode); io_buffer->update_data_reader_fn = new update_data_reader_functor(); } } @@ -63,42 +126,43 @@ class input_layer : public generic_input_layer { return new input_layer(*this); } - inline void validate_data_layout(); - - inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers) { - generic_input_layer::initialize_io_buffer(comm, num_parallel_readers, data_readers); + inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers) { + generic_input_layer::template initialize_io_buffer(comm, num_parallel_readers); } std::string get_type() const override { return "input"; } data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } +#ifdef LBANN_HAS_DISTCONV + void fp_compute () override; + using distconv_adapter_type = input_distconv_adapter; + friend distconv_adapter_type; + protected: + bool is_distconv_supported() const override { + return Dev == El::Device::CPU && T_layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique(*this); + } + distconv_adapter_type& get_distconv_adapter() override; + const distconv_adapter_type& get_distconv_adapter() const override; + bool keep_original_outputs(int index) const override; +#endif // 
LBANN_HAS_DISTCONV }; -template<> -inline void input_layer::validate_data_layout() { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "input_layer with partitioned_io_buffer does not supports MODEL_PARALLEL data layout"; - throw lbann_exception(err.str()); -} - -template<> -inline void input_layer::validate_data_layout() {} - -#ifdef LBANN_HAS_GPU -template<> -inline void input_layer::validate_data_layout() { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "input_layer with partitioned_io_buffer does not supports MODEL_PARALLEL data layout"; - throw lbann_exception(err.str()); -} - -template<> -inline void input_layer::validate_data_layout() {} -#endif // LBANN_HAS_GPU - -} +#ifndef LBANN_INPUT_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class input_layer< \ + T, partitioned_io_buffer, \ + data_layout::DATA_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_INPUT_LAYER_INSTANTIATE + +} // namespace lbann #endif // LBANN_LAYERS_INPUT_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/io/io_layer.hpp b/include/lbann/layers/io/io_layer.hpp index 4f0b22ec529..939010c0472 100644 --- a/include/lbann/layers/io/io_layer.hpp +++ b/include/lbann/layers/io/io_layer.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_IO_LAYER_HPP_INCLUDED #define LBANN_LAYERS_IO_LAYER_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/data_readers/data_reader.hpp" #include "lbann/utils/dataset.hpp" #include "lbann/io/persist.hpp" @@ -43,17 +43,15 @@ namespace lbann { /** @todo Move functionality to input_layer. */ -class io_layer : public Layer { +template +class io_layer : public data_type_layer { protected: - bool m_data_set_spans_models; data_reader_target_mode m_data_reader_mode; public: io_layer(lbann_comm *comm, - bool data_set_spans_models = true, data_reader_target_mode data_reader_mode = data_reader_target_mode::CLASSIFICATION) - : Layer(comm), - m_data_set_spans_models(data_set_spans_models), + : data_type_layer(comm), m_data_reader_mode(data_reader_mode) { } @@ -93,7 +91,7 @@ class io_layer : public Layer { /** * Get the dimensions of the underlying data. */ - virtual const std::vector get_data_dims(int child_index = 0) const = 0; + virtual std::vector get_data_dims(DataReaderMetaData& dr_metadata, int child_index = 0) const = 0; /** * Get the linearized size of the underlying data. 
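The block guarded by LBANN_INPUT_LAYER_INSTANTIATE above relies on explicit template instantiation: consumers of the header see extern template declarations generated by PROTO_DEVICE, so the input layer is compiled once in a dedicated source file instead of being re-instantiated in every translation unit. A minimal stand-alone sketch of that extern-template idiom, with illustrative names rather than LBANN's PROTO_DEVICE / instantiate_device.hpp machinery:

// widget.hpp -- the template definition everyone can see
template <typename T>
struct widget {
  T run(T x) const { return x + T(1); }
};

// Suppress implicit instantiation in including translation units; they will
// link against symbols provided elsewhere.
extern template struct widget<float>;
extern template struct widget<double>;

// widget.cpp -- the one translation unit that actually emits the definitions
template struct widget<float>;
template struct widget<double>;

Presumably instantiate_device.hpp expands PROTO_DEVICE over each supported data type and device, so the same macro produces both the extern declarations in the header and the matching instantiations in the layer's source file.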
diff --git a/include/lbann/layers/layer.hpp b/include/lbann/layers/layer.hpp index 6ed9ecb096b..9778e2f433f 100644 --- a/include/lbann/layers/layer.hpp +++ b/include/lbann/layers/layer.hpp @@ -29,22 +29,131 @@ #include "lbann/base.hpp" #include "lbann/comm.hpp" -#include "lbann/utils/summary.hpp" +#include "lbann/data_coordinator/data_coordinator_metadata.hpp" +#include "lbann/io/persist.hpp" #include "lbann/optimizers/optimizer.hpp" +#include "lbann/utils/description.hpp" +#include "lbann/utils/distconv.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" +#include "lbann/utils/summary.hpp" #include "lbann/utils/timer.hpp" -#include "lbann/utils/description.hpp" -#include "lbann/io/persist.hpp" -#include +#include "lbann/utils/typename.hpp" +#include "lbann/weights/weights.hpp" +#ifdef LBANN_HAS_DISTCONV +#include "lbann/layers/distconv_adapter.hpp" +#endif // LBANN_HAS_DISTCONV #include #include +/** @brief A utility macro for easily defining default-constructed sub-class + * builders.*/ +#define LBANN_DEFINE_LAYER_BUILDER(LAYER_NAME) \ + template \ + std::unique_ptr build_##LAYER_NAME##_layer_from_pbuf( \ + lbann_comm*, lbann_data::Layer const&) + +/** @brief A utility macro for easily defining "default" builders. + * @note Must be called inside lbann namespace. + */ +#define LBANN_LAYER_DEFAULT_BUILDER(LAYER_NAME) \ + template \ + std::unique_ptr build_##LAYER_NAME##_layer_from_pbuf( \ + lbann_comm* comm, lbann_data::Layer const&) { \ + using LayerType = LAYER_NAME##_layer; \ + return make_unique(comm); \ + } + +/** @brief A utility macro for easily adding ETI for layer builders + * @note Must be called inside lbann namespace. + */ +#define LBANN_LAYER_BUILDER_ETI(LAYER_NAME, T, Device) \ + template std::unique_ptr \ + build_##LAYER_NAME##_layer_from_pbuf( \ + lbann_comm*, lbann_data::Layer const&); \ + template std::unique_ptr \ + build_##LAYER_NAME##_layer_from_pbuf( \ + lbann_comm*, lbann_data::Layer const&) + +// Forward-declare protobuf classes +namespace lbann_data { +class Layer; +} + namespace lbann { // Forward declarations class model; -class weights; -class lbann_callback_sync_layers; +namespace callback { +class sync_layers; +} // namespace callback + +/** Represents a parallel strategy for a layer. */ +struct ParallelStrategy { + /** Number of process groups the sample dimension is split over. */ + int sample_groups = 0; + /** Number of groups the sample dimension is split over. */ + int sample_splits = 0; + /** Number of process groups the depth dimension is split over. */ + int depth_groups = 0; + /** Number of groups the depth dimension is split over. */ + int depth_splits = 0; + /** Number of process groups the height dimension is split over. */ + int height_groups = 0; + /** Number of groups the height dimension is split over. */ + int height_splits = 0; + /** Number of process groups the width dimension is split over. */ + int width_groups = 0; + /** Number of groups the width dimension is split over. */ + int width_splits = 0; + /** Number of process groups the channel dimension is split over. */ + int channel_groups = 0; + /** Number of groups the channel dimension is split over. */ + int channel_splits = 0; + /** Number of process groups the filter dimension is split over. */ + int filter_groups = 0; + /** Number of groups the filter dimension is split over. */ + int filter_splits = 0; + /** Number of times the layer is replicated (for FC layers right now). 
*/ + int replications = 0; + bool operator==(const ParallelStrategy &ps) const { + return sample_groups == ps.sample_groups && + sample_splits == ps.sample_splits && + depth_groups == ps.depth_groups && + depth_splits == ps.depth_splits && + height_groups == ps.height_groups && + height_splits == ps.height_splits && + width_groups == ps.width_groups && + width_splits == ps.width_splits && + channel_groups == ps.channel_groups && + channel_splits == ps.channel_splits && + filter_groups == ps.filter_groups && + filter_splits == ps.filter_splits && + replications == ps.replications; + } + bool operator!=(const ParallelStrategy &ps) const { + return !(*this == ps); + } +}; + +inline std::ostream &operator<<(std::ostream &os, + const ParallelStrategy &ps) { + os << "{" << ps.sample_groups + << "/" << ps.sample_splits + << ", " << ps.depth_groups + << "/" << ps.depth_splits + << ", " << ps.height_groups + << "/" << ps.height_splits + << ", " << ps.width_groups + << "/" << ps.width_splits + << ", " << ps.channel_groups + << "/" << ps.channel_splits + << ", " << ps.filter_groups + << "/" << ps.filter_splits + << ", " << ps.replications + << "}"; + return os; +} /** * @brief Neural network tensor operation. @@ -64,8 +173,7 @@ class lbann_callback_sync_layers; * the weights. */ class Layer { - friend class lbann_callback_sync_layers; - friend class lbann_callback_sync_selected; + friend class callback::sync_layers; public: @@ -96,22 +204,37 @@ class Layer { * human-readable, name. */ inline void set_name(const std::string name) { m_name = name; } + /** Get a string representing the layer datatype + */ + virtual std::string get_datatype_name() const { + return TypeName(); + }; /** Human-readable description. */ virtual description get_description() const; + /** Get the parallel strategy for the layer. */ + inline ParallelStrategy& get_parallel_strategy() { + return m_parallel_strategy; + } + /** Get the parallel strategy for the layer. */ + const ParallelStrategy& get_parallel_strategy() const { + return m_parallel_strategy; + } + /** Forward propagation step. * Apply a mathematical operation to input tensors to obtain output * tensors. */ - virtual void forward_prop(); + virtual void forward_prop() {}; /** Backward propagation step. * Given the objective function gradients w.r.t. the output * tensors, compute the gradients w.r.t. the input tensors and * w.r.t. the weights. This is essentially an application of the * chain rule. */ - virtual void back_prop(); + void back_prop(); + /** Update step. * Update the layer's internal members. Note that the optimization * step for the weights happens elsewhere. @@ -119,7 +242,7 @@ class Layer { virtual bool update(); virtual void summarize_stats(lbann_summary& summarizer, int step); - virtual void summarize_matrices(lbann_summary& summarizer, int step); + virtual void summarize_matrices(lbann_summary& summarizer, int step) = 0; /** Setup layer members. * This calls the 'setup_pointers', 'setup_dims', 'setup_matrices', @@ -127,7 +250,7 @@ class Layer { * assumed that pointers to parent/child layers have already been * initialized. */ - virtual void setup(); + virtual void setup(size_t max_mini_batch_size, DataReaderMetaData& dr_metadata); /** Check that the setup is reasonable. */ virtual void check_setup(); @@ -145,12 +268,6 @@ class Layer { * should override this function to return its template parameter. 
*/ virtual El::Device get_device_allocation() const = 0; - /** Get a human-readable description of the data_layout */ - std::string get_data_layout_string(data_layout d) const; - /** Get a human-readable description of the device allocation */ - std::string get_device_allocation_string(El::Device dev) const; - /** Get a short human-readable description of the device allocation */ - std::string get_device_allocation_string_short(El::Device dev) const; /** Reset layer stat counters. */ virtual void reset_counters(); @@ -198,6 +315,20 @@ class Layer { /** Get child layers. (const) */ inline const std::vector& get_child_layers() const { return m_child_layers; } + inline int find_child_layer_index(const Layer* l) const { + return std::distance(m_child_layers.begin(), + std::find(m_child_layers.begin(), + m_child_layers.end(), + l)); + } + + inline int find_parent_layer_index(const Layer* l) const { + return std::distance(m_parent_layers.begin(), + std::find(m_parent_layers.begin(), + m_parent_layers.end(), + l)); + } + /** Get number of parent layers. */ inline int get_num_parents() const { return get_parent_layers().size(); } /** Get number of child layers. */ @@ -241,14 +372,22 @@ class Layer { // Weights access functions // =========================================================== - /** Get references to weights. */ - inline std::vector& get_weights() { return m_weights; } - /** Get references to weights. (const) */ - inline const std::vector& get_weights() const { return m_weights; } /** Set list of pointers to weights. */ - inline void set_weights(std::vector w) { get_weights() = w; } + void set_weights(std::vector const& w) { + m_weights = w; + } + /** Replace weights with another Layer's weights*/ - void replace_weights(Layer* other_layer); + void replace_weights(Layer const& other_layer); + + // =========================================================== + // Tensor access functions + // =========================================================== + + /** Get activation tensor corresponding to child layer. */ + virtual const BaseDistMat& get_activations(const Layer& child) const = 0; + /** Get error signal tensor corresponding to parent layer. */ + virtual const BaseDistMat& get_error_signals(const Layer& parent) const = 0; // =========================================================== // Tensor dimension access functions @@ -266,34 +405,6 @@ class Layer { /** Set output tensor dimensions. */ void set_output_dims(std::vector dims, int output_index = 0); - // =========================================================== - // Tensor access functions - // =========================================================== - - /** Get activation tensor. */ - AbsDistMat& get_activations(int child_index = 0); - /** Get error signal tensor. */ - AbsDistMat& get_error_signals(int parent_index = 0); - /** Get previous activation tensor. */ - const AbsDistMat& get_prev_activations(int parent_index = 0) const; - /** Get activation tensor. */ - const AbsDistMat& get_activations(int child_index = 0) const; - /** Get previous error signal tensor. */ - const AbsDistMat& get_prev_error_signals(int child_index = 0) const; - /** Get error signal tensor. */ - const AbsDistMat& get_error_signals(int parent_index = 0) const; - /** Get local portion of activation tensor. */ - AbsMat& get_local_activations(int child_index = 0); - /** Get local portion of error signal tensor. */ - AbsMat& get_local_error_signals(int parent_index = 0); - /** Get local portion of previous activation tensor. 
*/ - const AbsMat& get_local_prev_activations(int parent_index = 0) const; - /** Get local portion of activation tensor. */ - const AbsMat& get_local_activations(int child_index = 0) const; - /** Get local portion of previous error signal tensor. */ - const AbsMat& get_local_prev_error_signals(int child_index = 0) const; - /** Get local portion of error signal tensor. */ - const AbsMat& get_local_error_signals(int parent_index = 0) const; /** Get reference to LBANN communicator. */ lbann_comm* get_comm() const { return m_comm; } @@ -320,8 +431,63 @@ class Layer { void unfreeze(); bool is_frozen() const; + /** @brief Set whether to keep or dynamically reallocate error signals. + * + * Passing a value of @c true means to keep the error signals; @c + * false means to dynamically reallocate them. + */ + virtual void set_keep_error_signals(bool) = 0; + protected: + /** @name Weights-related accessors */ + ///@{ + void add_weights(weights* w) { + m_weights.push_back(w); + } + size_t num_weights() const noexcept { return m_weights.size(); } + bool has_weights() const noexcept { return num_weights() > 0; } + bool has_weights(size_t idx) const noexcept { + return ((idx < this->num_weights()) && (m_weights[idx])); + } + void set_num_weights(size_t n) { m_weights.resize(n, nullptr); } + void set_weights(size_t idx, weights* w) { + m_weights.at(idx) = w; + } + weights const& get_weights(size_t idx) const { + if (idx >= num_weights()) { + LBANN_ERROR("Asked for weights index \"", idx, "\"; " + "however, this layer has ", num_weights(), + " weights associated with it."); + } + if (m_weights[idx] == nullptr) { + LBANN_ERROR("Logic error: Detected an in-bounds null weights pointer."); + } + return *(m_weights[idx]); + } + + weights& get_weights(size_t idx) { + return const_cast( + static_cast(*this).get_weights(idx)); + } + + void add_as_gradient_source() + { + for (auto&& w : this->m_weights) { + optimizer* opt = w->get_optimizer(); + if (opt != nullptr) { opt->add_gradient_source(this); } + } + } + + void remove_as_gradient_source() + { + for (auto&& w : this->m_weights) { + auto&& opt = w->get_optimizer(); + if (opt != nullptr) { opt->remove_gradient_source(this); } + } + } + ///@} + // =========================================================== // Setup helper functions // =========================================================== @@ -336,7 +502,7 @@ class Layer { * the base method sets all uninitialized output tensor dimensions * equal to the first input tensor dimensions. */ - virtual void setup_dims(); + virtual void setup_dims(DataReaderMetaData& dr_metadata); /** Setup distributed matrices. * Called by the 'setup' function. Each column of these distributed * matrices is interpreted as the flattened tensor for a mini-batch @@ -344,20 +510,12 @@ class Layer { * 'construct_matrix' function. If any matrices have already been * setup, they are destroyed and reinstantiated. */ - virtual void setup_matrices(const El::Grid& grid); - /** Construct distributed matrix. - * Called by the 'setup_matrices' function. 'type' is one of the - * following: "input", "output", "gradient_wrt_output", - * "gradient_wrt_input". - */ - virtual std::unique_ptr construct_matrix(const El::Grid& grid, - std::string type, - El::Int index); + virtual void setup_matrices(const El::Grid& grid) = 0; /** Setup layer data. * Called by the 'setup' function. Memory is allocated for * distributed matrices. */ - virtual void setup_data(); + virtual void setup_data(size_t max_mini_batch_size) {}; /** Setup GPU objects. 
* Called by the 'setup' function if the layer is on GPUs. */ @@ -372,12 +530,12 @@ class Layer { * setup as a view or copy of the corresponding parent layer's * output tensor. */ - virtual void fp_setup_inputs(El::Int mini_batch_size); + virtual void fp_setup_inputs(El::Int mini_batch_size) = 0; /** Setup output tensors. * Called by the 'forward_prop' function. Each output tensor is * resized to match the mini-batch size. */ - virtual void fp_setup_outputs(El::Int mini_batch_size); + virtual void fp_setup_outputs(El::Int mini_batch_size) = 0; /** Apply layer operation. * Called by the 'forward_prop' function. Given the input tensors, * the output tensors are populated with computed values. @@ -388,24 +546,18 @@ class Layer { // Back prop step helper functions // =========================================================== - /** Setup gradient w.r.t. output tensors. - * Called by the 'back_prop' function. Each gradient w.r.t. output - * tensor is setup as a view or copy of the corresponding child - * layer's gradient w.r.t. input tensor. - */ - virtual void bp_setup_gradient_wrt_outputs(El::Int mini_batch_size); /** Setup gradient w.r.t. input tensors. * Called by the 'back_prop' function. Each gradient w.r.t. input * tensor is resized to match the mini-batch size. */ - virtual void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size); + virtual void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) = 0; /** Compute objective funciton gradients. * Called by the 'back_prop' function. Given the input, output, and * gradient w.r.t. output tensors, the gradient w.r.t. input * tensors are populated with the computed values and the gradients * w.r.t. the weights are sent to the appropriate optimizers. */ - virtual void bp_compute(); + virtual void bp_compute() {}; // =========================================================== // Update step helper functions @@ -423,9 +575,6 @@ class Layer { /** Reference to LBANN communicator. */ lbann_comm *m_comm; - /** References to layer weights. */ - std::vector m_weights; - /** References to parent layers. */ std::vector m_parent_layers; /** References to child layers. */ @@ -465,39 +614,120 @@ class Layer { private: - // =========================================================== - // Private access functions - // =========================================================== - - /** Get activation tensor corresponding to child layer. */ - const AbsDistMat& get_activations(const Layer& child) const; - /** Get error signal tensor corresponding to parent layer. */ - const AbsDistMat& get_error_signals(const Layer& parent) const; + virtual void setup_weights(size_t idx, weights& w) = 0; + + /** @name Implementation details of back-prop. */ + ///@{ + + /** @brief Move error signals from a child to its parent. + * + * This is a hacky workaround to C++ rules for protected member + * functions. No error-checking is done, e.g., to assert that the + * two layers actually have a parent-child relationship because + * this is just an implementation detail. The symbol is never + * exposed to the public API. 
+ * + * @param parent The parent layer, into which the signal is moved + * @param child The child layer, from which the signal is moved + * @param signal The now-released error signal from the child layer + */ + friend void attempt_move_error_signal( + Layer& parent, Layer const& child, + std::unique_ptr signals); + friend void attempt_view_error_signal( + Layer& parent, Layer const& child, + const BaseDistMat& signals); + friend void deep_copy_error_signal( + Layer& parent, Layer const& child, + const BaseDistMat& signals); + + /** @brief Computes the core back-prop steps. */ + virtual void back_prop_impl_() = 0; + + /** @brief Allocates new storage for the gradients that this layer + * will compute. + * + * If the layer has persistent error signal information, this will + * simply clear the gradients. + */ + virtual void allocate_new_gradients_() = 0; + + /** @brief Moves all error signals to their respective parents. + * + * Error signals from this instances either are directly moved into + * the parent layer or, in cases in which a direct move is not + * possible, are deep-copied into a new tensor in the parent layer + * (e.g., into a different data type or data distribution). + */ + virtual void propagate_error_signals_to_parents_() = 0; + + /** @brief Releases the error signals propagated from the child + * layers. + * + * At the conclusion of back-prop, the error signals propagated + * from the child layers are no longer needed. This ensures that + * the memory is released. + * + * This function may do other work, but must respect the persistent + * error signal flag. + */ + virtual void clear_prev_error_signals_() = 0; + + /** @brief Assumes ownership of the error signals from the specified + * child layer. + * + * This is a simple pointer move when possible; otherwise it is a + * deep-copy of the signal data. + * + * @param child The layer whence the signal is coming. + * @param signal The error signals being sent to this layer. + */ + virtual void move_or_copy_prev_error_signal_( + const Layer& child, + std::unique_ptr signal) = 0; + + /** @brief Attempts to view the error signals from the specified + * child layer. + * + * This is a simple data view when possible; otherwise it is a + * deep-copy of the signal data. + * + * @param child The layer whence the signal is coming. + * @param signal The error signals being sent to this layer. + */ + virtual void view_or_copy_prev_error_signal_( + const Layer& child, + const El::BaseDistMatrix& signal) = 0; + + /** @brief Deep-copy the error signals from the specified child + * layer. + * + * @param child The layer whence the signal is coming. + * @param signal The error signals being sent to this layer. + */ + virtual void deep_copy_prev_error_signal_( + const Layer& child, + const El::BaseDistMatrix& signal) = 0; + + ///@} // =========================================================== // Private class members // =========================================================== + /** @brief References to layer weights. + * + * These are references to the base weights objects. The tensor + * data type for weights storage might differ from the tensor data + * type of this layer's tensors. To ensure consistency, we must + * only access weights values through the WeightsProxy class during + * training. + */ + std::vector m_weights; + /** Dimensions of output tensors. */ std::vector> m_output_dims_list; - /** Input tensors. - * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_inputs; - /** Output tensors. 
- * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_outputs; - /** Objective function gradients w.r.t. the output tensors. - * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_gradient_wrt_outputs; - /** Objective function gradients w.r.t. the input tensors. - * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_gradient_wrt_inputs; - /** Hint layer. * During setup, the output tensor dimensions are set to match the * first output tensor of the hint layer. Derived classes may do @@ -505,8 +735,60 @@ class Layer { */ const Layer* m_hint_layer = nullptr; + /** Parallel strategy for the layer. */ + ParallelStrategy m_parallel_strategy; + +private: + friend std::vector extract_weights(Layer const& l); + friend std::vector extract_weights(Layer& l); + +#ifdef LBANN_HAS_DISTCONV + friend class distconv_adapter; + public: + /** Indicate whether distconv is enabled. */ + bool distconv_enabled() const; + /** Indicate whether original input matrices need to be set up. */ + virtual bool keep_original_inputs(int index) const; + /** Indicate whether original output matrices need to be set up. */ + virtual bool keep_original_outputs(int index) const; + /** Indicate whether original gradient wrt input matrices need to be set up. */ + virtual bool keep_original_gradient_wrt_inputs(int index) const; + /** Indicate whether original gradient wrt output matrices need to be set up. */ + virtual bool keep_original_gradient_wrt_outputs(int index) const; + /** Retrievs distconv adapter. */ + virtual const distconv_adapter& get_distconv_adapter() const; + /** Retrievs distconv adapter. */ + virtual distconv_adapter& get_distconv_adapter(); + + protected: + /** Indicate whether distconv is supported. */ + virtual bool is_distconv_supported() const { return false; } + /** Pre-initialize distconv attributes needed for setup_data(). */ + void prepare_distconv(); + virtual void setup_distconv_adapter() = 0; + std::unique_ptr& get_distconv_adapter_ptr() { + return m_dc; }; + const std::unique_ptr& get_distconv_adapter_ptr() const { + return m_dc; }; + + private: + mutable bool m_distconv_enabled = false; + mutable bool m_distconv_enabled_set = false; + std::unique_ptr m_dc; +#endif // LBANN_HAS_DISTCONV }; +// FIXME (trb 05/28/2020): These should go away. They're used in +// "model.cpp" and "model_factory.cpp" but could be refactored +// out. Outside the scope of current PR. 
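The non-const Layer::get_weights(size_t) above uses the standard idiom of implementing the bounds and null checks once in the const overload and forwarding to it through const_cast. A small self-contained sketch of that idiom, with illustrative types (not LBANN's weights class):

#include <cstddef>
#include <stdexcept>
#include <vector>

class registry {
  std::vector<int*> m_items;
public:
  // All validation lives in the const overload.
  const int& get(std::size_t idx) const {
    if (idx >= m_items.size() || m_items[idx] == nullptr) {
      throw std::out_of_range("registry: bad item index");
    }
    return *m_items[idx];
  }
  // The non-const overload forwards to the const one and casts the result back.
  int& get(std::size_t idx) {
    return const_cast<int&>(static_cast<const registry&>(*this).get(idx));
  }
};

The cast is safe because the object is known to be non-const at the call site; only the checking logic is shared.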
+inline std::vector extract_weights(Layer& l) { + return l.m_weights; +} + +inline std::vector extract_weights(Layer const& l) { + return {l.m_weights.cbegin(), l.m_weights.cend()}; +} + } // namespace lbann #endif // LBANN_LAYERS_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/learning/CMakeLists.txt b/include/lbann/layers/learning/CMakeLists.txt index ac855e21023..71111d57435 100644 --- a/include/lbann/layers/learning/CMakeLists.txt +++ b/include/lbann/layers/learning/CMakeLists.txt @@ -1,8 +1,12 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS base_convolution.hpp + channelwise_scale_bias.hpp + channelwise_fully_connected.hpp convolution.hpp deconvolution.hpp + embedding.hpp + entrywise_scale_bias.hpp fully_connected.hpp fully_connected_cuda.hpp learning.hpp diff --git a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp index afa3046086b..5f15e935ee0 100644 --- a/include/lbann/layers/learning/base_convolution.hpp +++ b/include/lbann/layers/learning/base_convolution.hpp @@ -27,23 +27,70 @@ #ifndef LBANN_LAYERS_LEARNING_BASE_CONVOLUTION_HPP_INCLUDED #define LBANN_LAYERS_LEARNING_BASE_CONVOLUTION_HPP_INCLUDED -#include -#include +#include "lbann/layers/data_type_layer.hpp" #include "lbann/layers/layer.hpp" -#include "lbann/weights/initializer.hpp" -#include "lbann/weights/variance_scaling_initializers.hpp" #include "lbann/utils/cudnn.hpp" -#include "lbann/utils/exception.hpp" -#include "lbann/utils/random.hpp" -#include "lbann/utils/timer.hpp" -#include "lbann/utils/im2col.hpp" +#include "lbann/utils/memory.hpp" + +#include namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class base_convolution_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + + base_convolution_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~base_convolution_adapter() = default; + + void setup_fp_tensors() override; + void setup_bp_tensors() override; + void setup_layer(size_t workspace_capacity) override; + + void fp_compute_convolution(); + void fp_apply_bias(); + + void bp_compute_convolution_data(); + void bp_compute_convolution_filter(); + + std::unique_ptr> m_conv; + std::unique_ptr m_kernel; + std::unique_ptr m_bias; + std::unique_ptr m_kernel_gradient; + std::unique_ptr m_bias_gradient; + + std::string m_fwd_algo; + std::string m_bwd_data_algo; + std::string m_bwd_filter_algo; +}; +#endif // LBANN_HAS_DISTCONV + /** @brief Computation kernels for convolution and deconvolution layers. */ -template -class base_convolution_layer : public Layer { +template +class base_convolution_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + template + using DMatDT = El::Matrix; + +#ifdef LBANN_HAS_CUDNN + using ScalingType = cudnn::ScalingParamType; +#else + using ScalingType = TensorDataType; +#endif // LBANN_HAS_CUDNN + + ///@} protected: @@ -68,10 +115,15 @@ class base_convolution_layer : public Layer { /** Scaling factor for bias term. * If the scaling factor is zero, bias is not applied. */ - DataType m_bias_scaling_factor; + ScalingType m_bias_scaling_factor; #ifdef LBANN_HAS_CUDNN + /** @brief Math type to use inside cuDNN. + * @details Must be cached since it isn't used until setup. 
+ */ + cudnnMathType_t m_convolution_math_type = + cudnn::get_default_convolution_math_type(); /** Convolution kernel cuDNN descriptor. */ cudnnFilterDescriptor_t m_kernel_cudnn_desc = nullptr; /** Convolution cuDNN descriptor. */ @@ -79,7 +131,7 @@ class base_convolution_layer : public Layer { /** Bias tensor cuDNN descriptor. */ cudnnTensorDescriptor_t m_bias_cudnn_desc = nullptr; /** Tensor cuDNN descriptors. */ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; /** Forward algorithm cache (mini-batch size -> algo). */ std::unordered_map m_fwd_cudnn_algos; /** Backward data algorithm cache (mini-batch size -> algo). */ @@ -99,358 +151,28 @@ class base_convolution_layer : public Layer { std::vector strides, std::vector dilations, int groups, - bool has_bias) - : Layer(comm), - m_output_channels(output_channels), - m_conv_dims(std::move(conv_dims)), - m_pads(std::move(pads)), - m_strides(std::move(strides)), - m_dilations(std::move(dilations)), - m_groups(groups), - m_bias_scaling_factor(has_bias ? 1 : 0) -#ifdef LBANN_HAS_CUDNN - , m_tensors_cudnn_desc(this) -#endif // LBANN_HAS_CUDNN - {} - - base_convolution_layer(const base_convolution_layer& other) - : Layer(other), - m_output_channels(other.m_output_channels), - m_conv_dims(other.m_conv_dims), - m_pads(other.m_pads), - m_strides(other.m_strides), - m_dilations(other.m_dilations), - m_groups(other.m_groups), - m_bias_scaling_factor(other.m_bias_scaling_factor) -#ifdef LBANN_HAS_CUDNN - , m_tensors_cudnn_desc(other.m_tensors_cudnn_desc), - m_fwd_cudnn_algos(other.m_fwd_cudnn_algos), - m_bwd_data_cudnn_algos(other.m_bwd_data_cudnn_algos), - m_bwd_filter_cudnn_algos(other.m_bwd_filter_cudnn_algos) -#endif // LBANN_HAS_CUDNN - { -#ifdef LBANN_HAS_CUDNN - copy_kernel_cudnn_desc(other.m_kernel_cudnn_desc, - m_kernel_cudnn_desc); - copy_convolution_cudnn_desc(other.m_convolution_cudnn_desc, - m_convolution_cudnn_desc); - if (other.m_bias_scaling_factor != DataType(0)) { - cudnn::copy_tensor_desc(other.m_bias_cudnn_desc, - m_bias_cudnn_desc); - } - m_tensors_cudnn_desc.set_layer(this); -#endif // LBANN_HAS_CUDNN - } + bool has_bias); - base_convolution_layer& operator=(const base_convolution_layer& other) { - Layer::operator=(other); - m_output_channels = other.m_output_channels; - m_conv_dims = other.m_conv_dims; - m_pads = other.m_pads; - m_strides = other.m_strides; - m_dilations = other.m_dilations; - m_groups = other.m_groups; - m_bias_scaling_factor = other.m_bias_scaling_factor; + base_convolution_layer(const base_convolution_layer& other); -#ifdef LBANN_HAS_CUDNN - // Copy cuDNN objects - copy_kernel_cudnn_desc(other.m_kernel_cudnn_desc, - m_kernel_cudnn_desc); - copy_convolution_cudnn_desc(other.m_convolution_cudnn_desc, - m_convolution_cudnn_desc); - if (other.m_bias_scaling_factor != DataType(0)) { - cudnn::copy_tensor_desc(other.m_bias_cudnn_desc, - m_bias_cudnn_desc); - } - m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; - m_tensors_cudnn_desc.set_layer(this); - m_fwd_cudnn_algos = other.m_fwd_cudnn_algos; - m_bwd_data_cudnn_algos = other.m_bwd_data_cudnn_algos; - m_bwd_filter_cudnn_algos = other.m_bwd_filter_cudnn_algos; -#endif // LBANN_HAS_CUDNN + base_convolution_layer& operator=(const base_convolution_layer& other); - return *this; - } + ~base_convolution_layer(); - ~base_convolution_layer() { #ifdef LBANN_HAS_CUDNN - if (m_kernel_cudnn_desc != nullptr) { - CHECK_CUDNN_DTOR(cudnnDestroyFilterDescriptor(m_kernel_cudnn_desc)); - } - if 
(m_convolution_cudnn_desc != nullptr) { - CHECK_CUDNN_DTOR(cudnnDestroyConvolutionDescriptor(m_convolution_cudnn_desc)); - } - if (m_bias_cudnn_desc != nullptr) { - CHECK_CUDNN_DTOR(cudnnDestroyTensorDescriptor(m_bias_cudnn_desc)); - } + void set_cudnn_math_mode(cudnnMathType_t math_type) noexcept; #endif // LBANN_HAS_CUDNN - } - - description get_description() const override { - auto&& desc = Layer::get_description(); - std::ostringstream ss; - - // Convolution dimensions - ss.str(std::string{}); - ss.clear(); - for (size_t i = 0; i < m_conv_dims.size(); ++i) { - ss << (i > 0 ? ", " : "" ) << m_conv_dims[i]; - } - desc.add("Convolution dimensions", ss.str()); - // Strides - ss.str(std::string{}); - ss.clear(); - for (size_t i = 0; i < m_strides.size(); ++i) { - ss << (i > 0 ? ", " : "" ) << m_strides[i]; - } - desc.add("Strides", ss.str()); + description get_description() const override; + void setup_dims(DataReaderMetaData& dr_metadata) override; - // Pads - ss.str(std::string{}); - ss.clear(); - for (size_t i = 0; i < m_pads.size(); ++i) { - ss << (i > 0 ? ", " : "" ) << m_pads[i]; - } - desc.add("Pads", ss.str()); - - // Dilation - ss.str(std::string{}); - ss.clear(); - for (size_t i = 0; i < m_dilations.size(); ++i) { - ss << (i > 0 ? ", " : "" ) << m_dilations[i]; - } - desc.add("Dilations", ss.str()); - - // Groups - desc.add("Groups", m_groups); - - // Bias - ss.str(std::string{}); - ss.clear(); - ss << (m_bias_scaling_factor == DataType(0) ? - "disabled" : "enabled"); - desc.add("Bias", ss.str()); - - // Result - return desc; - - } - - void setup_dims() override { - Layer::setup_dims(); - std::ostringstream err; - - // Check number of channels and channel groups - const auto& input_dims = get_input_dims(); - if (m_output_channels < 1) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has an invalid number of output channels " - << "(" << m_output_channels << ")"; - LBANN_ERROR(err.str()); - } else if (m_groups < 1) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has an invalid number of groups (" << m_groups << ")"; - LBANN_ERROR(err.str()); - } else if (input_dims[0] % m_groups != 0 - || m_output_channels % m_groups != 0) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has " << m_groups << " groups, which does not divide " - << "the input channels (" << input_dims[0] << ") or " - << "the output channels (" << m_output_channels << ")"; - LBANN_ERROR(err.str()); - } - - // Check kernel dims, pads, stride, dilations - const auto& num_spatial_dims = input_dims.size() - 1; - if (m_conv_dims.size() != num_spatial_dims - || std::any_of(m_conv_dims.begin(), m_conv_dims.end(), - [](El::Int d) { return d < 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has invalid spatial dimensions for convolution kernel ("; - if (m_conv_dims.empty()) { err << "no dimensions"; } - for (size_t i = 0; i < m_conv_dims.size(); ++i) { - err << (i > 0 ? "x" : "") << m_conv_dims[i]; - } - err << ", expected " << num_spatial_dims << " spatial dimensions)"; - LBANN_ERROR(err.str()); - } else if (m_pads.size() != num_spatial_dims) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has invalid convolution pads (("; - for (size_t i = 0; i < m_pads.size(); ++i) { - err << (i > 0 ? 
"," : "") << m_pads[i]; - } - err << "), expected " << num_spatial_dims << " spatial dimensions)"; - LBANN_ERROR(err.str()); - } else if (m_strides.size() != num_spatial_dims - || std::any_of(m_strides.begin(), m_strides.end(), - [](El::Int d) { return d < 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has invalid convolution strides (("; - for (size_t i = 0; i < m_strides.size(); ++i) { - err << (i > 0 ? "," : "") << m_strides[i]; - } - err << "), expected " << num_spatial_dims << " spatial dimensions)"; - LBANN_ERROR(err.str()); - } else if (m_dilations.size() != num_spatial_dims - || std::any_of(m_dilations.begin(), m_dilations.end(), - [](El::Int d) { return d < 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has invalid convolution dilations (("; - for (size_t i = 0; i < m_dilations.size(); ++i) { - err << (i > 0 ? "," : "") << m_dilations[i]; - } - err << "), expected " << num_spatial_dims << " spatial dimensions)"; - LBANN_ERROR(err.str()); - } - - // Make sure that configuration is supported - if (Device == El::Device::CPU - && std::any_of(m_dilations.begin(), m_dilations.end(), - [](El::Int d) { return d != 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has non-unit dilation, which is not yet supported on CPU"; - LBANN_ERROR(err.str()); - } - if (Device == El::Device::CPU && m_groups != 1) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has " << m_groups << " groups, " - << "but only one group is currently supported on CPU"; - LBANN_ERROR(err.str()); - } - - } - - /** Setup layer data. + /** @brief Setup layer data. * The kernel weights are setup in the convolution and * deconvolution classes. */ - void setup_data() override { - Layer::setup_data(); - - // Tensor dimensions - const auto& input_dims = get_input_dims(); - const auto& output_dims = get_output_dims(); - const auto& kernel_dims = get_kernel_dims(); - const auto& kernel_size = std::accumulate(kernel_dims.begin(), - kernel_dims.end(), - 1, std::multiplies()); - - // Initialize default weights if none are provided - if (this->m_weights.size() > 2) { - std::stringstream err; - err << "attempted to setup layer \"" << get_name() << "\" " - << "with an invalid number of weights " - << "(expected at most 2, " - << "found " << this->m_weights.size() << ")"; - LBANN_ERROR(err.str()); - } - if (m_bias_scaling_factor != DataType(0)) { - this->m_weights.resize(2, nullptr); - } else { - this->m_weights.resize(1, nullptr); - } - if (this->m_weights[0] == nullptr) { - auto* w = new weights(get_comm()); - std::unique_ptr init(new he_initializer(probability_distribution::gaussian)); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_kernel"); - w->set_initializer(init); - w->set_optimizer(opt); - this->m_weights[0] = w; - this->m_model->add_weights(w); - } - auto& kernel_weights = *this->m_weights[0]; - - // Initialize variance scaling initialization - auto* cast_initializer - = dynamic_cast(kernel_weights.get_initializer()); - if (cast_initializer != nullptr) { - cast_initializer->set_fan_in(kernel_size / output_dims[0]); - cast_initializer->set_fan_out(kernel_size / input_dims[0]); - } - - // Initialize weight matrices - auto dist = get_prev_activations().DistData(); - dist.colDist = El::STAR; - dist.rowDist = El::STAR; - kernel_weights.set_dims(kernel_dims); - kernel_weights.set_matrix_distribution(dist); - - // Set up bias if needed. 
- if (m_bias_scaling_factor != DataType(0)) { - if (this->m_weights[1] == nullptr) { - auto* w = new weights(get_comm()); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_bias"); - w->set_optimizer(opt); - this->m_weights[1] = w; - this->m_model->add_weights(w); - } - auto& bias_weights = *this->m_weights[1]; - bias_weights.set_dims(output_dims[0]); - bias_weights.set_matrix_distribution(dist); - } + void setup_data(size_t max_mini_batch_size) override; - // Initialize freeze state - for (auto&& w : this->m_weights) { - if (m_frozen) { - w->freeze(); - } else { - w->unfreeze(); - } - } - for (auto&& w : this->m_weights) { - if (w->is_frozen() != m_frozen) { - std::stringstream err; - err << (m_frozen ? "" : "un") << "frozen " - << "layer \"" << get_name() << "\" has " - << (w->is_frozen() ? "" : "un") << "frozen " - << "weights \"" << w->get_name() << "\""; - LBANN_ERROR(err.str()); - } - } - - } - - /// Initialize GPU objects - void setup_gpu() override { - Layer::setup_gpu(); -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - - const auto& output_dims = get_output_dims(); - const auto& kernel_dims = get_kernel_dims(); - - // Set kernel descriptor - CHECK_CUDNN(cudnnCreateFilterDescriptor(&m_kernel_cudnn_desc)); - CHECK_CUDNN(cudnnSetFilterNdDescriptor(m_kernel_cudnn_desc, - cudnn::get_data_type(), - CUDNN_TENSOR_NCHW, - kernel_dims.size(), - kernel_dims.data())); - - // Set convolution descriptor - CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&m_convolution_cudnn_desc)); - CHECK_CUDNN(cudnnSetConvolutionNdDescriptor(m_convolution_cudnn_desc, - m_pads.size(), - m_pads.data(), - m_strides.data(), - m_dilations.data(), - CUDNN_CROSS_CORRELATION, - cudnn::get_data_type())); - CHECK_CUDNN(cudnnSetConvolutionGroupCount(m_convolution_cudnn_desc, - m_groups)); - - // Set bias tensor descriptor - if (m_bias_scaling_factor != DataType(0)) { - std::vector bias_dims(output_dims.size() + 1, 1); - bias_dims[1] = output_dims[0]; - cudnn::set_tensor_desc(m_bias_cudnn_desc, bias_dims); - } - -#endif // LBANN_HAS_CUDNN - } + /** @brief Initialize GPU objects */ + void setup_gpu() override; protected: @@ -458,564 +180,23 @@ class base_convolution_layer : public Layer { virtual std::vector get_kernel_dims() const = 0; /** Convolution with cuDNN. */ - void apply_convolution_cudnn(bool during_forward_prop) { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - - // Useful constants - const DataType zero = DataType(0); - const DataType one = DataType(1); - - // Matrices - const auto& kernel = m_weights[0]->get_values(); - const auto& input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); - auto& output = (during_forward_prop ? 
- get_local_activations() : - get_local_error_signals()); - - // Do nothing if there is no local data - if (input.Height() < 1 || input.Width() < 1 - || output.Height() < 1 || output.Width() < 1) { - return; - } - - // Initialize GPU workspace - GPUMat workspace; -#ifdef HYDROGEN_HAVE_CUB - workspace.SetMemoryMode(1); -#endif // HYDROGEN_HAVE_CUB - size_t workspace_size = 1 << 30; /// @todo Allocate largest free block - workspace.Resize(workspace_size / sizeof(DataType), 1); - workspace_size = workspace.Height() * sizeof(DataType); - - // Convolution parameters - std::vector input_dims, output_dims; - cudnnTensorDescriptor_t input_desc, output_desc; - if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); - input_desc = m_tensors_cudnn_desc.get_prev_activations(); - output_desc = m_tensors_cudnn_desc.get_activations(); - } - else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); - input_desc = m_tensors_cudnn_desc.get_prev_error_signals(); - output_desc = m_tensors_cudnn_desc.get_error_signals(); - } - - // Perform convolution on the GPU - // Determine convolution algorithm - cudnnConvolutionFwdAlgo_t convolution_cudnn_algorithm - = get_forward_algo_cudnn(input.Width(), input_desc, input.LockedBuffer(), - m_kernel_cudnn_desc, kernel.LockedBuffer(), - m_convolution_cudnn_desc, - output_desc, output.Buffer(), - workspace_size, workspace.Buffer()); - - // Apply convolution - CHECK_CUDNN(cudnnConvolutionForward(cudnn::get_handle(), - &one, - input_desc, - input.LockedBuffer(), - m_kernel_cudnn_desc, - kernel.LockedBuffer(), - m_convolution_cudnn_desc, - convolution_cudnn_algorithm, - workspace.Buffer(), - workspace_size, - &zero, - output_desc, - output.Buffer())); - -#endif // LBANN_HAS_CUDNN - } + void apply_convolution_cudnn(bool during_forward_prop); /** Transposed convolution with cuDNN. */ - void apply_transposed_convolution_cudnn(bool during_forward_prop) { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - - // Useful constants - const DataType zero = DataType(0); - const DataType one = DataType(1); - - // GPU data - const auto& kernel = m_weights[0]->get_values(); - const auto& input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); - auto& output = (during_forward_prop ? 
- get_local_activations() : - get_local_error_signals()); - - // Do nothing if there is no local data - if (input.Height() < 1 || input.Width() < 1 - || output.Height() < 1 || output.Width() < 1) { - return; - } - - // Initialize GPU workspace - // Note: Use CUB GPU memory pool if possible - GPUMat workspace; -#ifdef HYDROGEN_HAVE_CUB - workspace.SetMemoryMode(1); -#endif // HYDROGEN_HAVE_CUB - size_t workspace_size = 1 << 30; /// @todo Allocate largest free block - workspace.Resize(workspace_size / sizeof(DataType), 1); - workspace_size = workspace.Height() * sizeof(DataType); - - // Convolution transpose parameters - std::vector input_dims, output_dims; - cudnnTensorDescriptor_t input_desc, output_desc; - if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); - input_desc = m_tensors_cudnn_desc.get_prev_activations(); - output_desc = m_tensors_cudnn_desc.get_activations(); - } - else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); - input_desc = m_tensors_cudnn_desc.get_prev_error_signals(); - output_desc = m_tensors_cudnn_desc.get_error_signals(); - } - - // Perform transposed convolution on the GPU - // Determine transposed convolution algorithm - cudnnConvolutionBwdDataAlgo_t transposed_convolution_cudnn_algorithm - = get_backward_data_algo_cudnn(input.Width(), - m_kernel_cudnn_desc, kernel.LockedBuffer(), - input_desc, input.LockedBuffer(), - m_convolution_cudnn_desc, - output_desc, output.Buffer(), - workspace_size, workspace.Buffer()); - // Perform transposed convolution - CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn::get_handle(), - &one, - m_kernel_cudnn_desc, - kernel.LockedBuffer(), - input_desc, - input.LockedBuffer(), - m_convolution_cudnn_desc, - transposed_convolution_cudnn_algorithm, - workspace.Buffer(), - workspace_size, - &zero, - output_desc, - output.Buffer())); - - - #endif // LBANN_HAS_CUDNN - } - - void apply_bias_cudnn() { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - auto& local_output = get_local_activations(); - if (m_bias_scaling_factor != DataType(0) - && local_output.Height() > 0 - && local_output.Width() > 0) { - const DataType one = 1; - const auto& bias = m_weights[1]->get_values(); - CHECK_CUDNN(cudnnAddTensor(cudnn::get_handle(), - &m_bias_scaling_factor, - m_bias_cudnn_desc, - bias.LockedBuffer(), - &one, - m_tensors_cudnn_desc.get_activations(), - local_output.Buffer())); - } - #endif // LBANN_HAS_CUDNN - } - - void compute_gradients_cudnn(bool using_transposed_convolution) { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else + void apply_transposed_convolution_cudnn(bool during_forward_prop); - // Matrices - const auto& local_input = get_local_prev_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - - // Useful constants - const int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size(); - const bool has_local_data = (local_input.Height() > 0 - && local_input.Width() > 0 - && local_gradient_wrt_output.Height() > 0 - && local_gradient_wrt_output.Width() > 0); - - // Compute bias gradient - if (m_bias_scaling_factor != DataType(0) - && m_weights[1]->get_optimizer() != nullptr) { - optimizer* bias_optimizer = m_weights[1]->get_optimizer(); - DataType dst_scale = DataType(0), gradient_scale = DataType(0); - auto& bias_gradient = bias_optimizer->get_gradient_buffer( - dst_scale, gradient_scale, true); - gradient_scale /= effective_mini_batch_size; - if (has_local_data) { - 
CHECK_CUDNN(cudnnConvolutionBackwardBias( - cudnn::get_handle(), - &gradient_scale, - m_tensors_cudnn_desc.get_prev_error_signals(), - local_gradient_wrt_output.LockedBuffer(), - &dst_scale, - m_bias_cudnn_desc, - bias_gradient.Buffer())); - } else { - El::Scale(dst_scale, bias_gradient); - } - } - - // Compute kernel gradient - optimizer* kernel_optimizer = m_weights[0]->get_optimizer(); - if (kernel_optimizer != nullptr) { - DataType dst_scale = DataType(0), gradient_scale = DataType(0); - auto& kernel_gradient = kernel_optimizer->get_gradient_buffer( - dst_scale, gradient_scale, true); - gradient_scale /= effective_mini_batch_size; - if (has_local_data) { - // Initialize GPU workspace - GPUMat workspace; -#ifdef HYDROGEN_HAVE_CUB - workspace.SetMemoryMode(1); // CUB GPU memory pool -#endif // HYDROGEN_HAVE_CUB - size_t workspace_size = 1 << 30; /// @todo Allocate largest free block - workspace.Resize(workspace_size / sizeof(DataType), 1); - workspace_size = workspace.Height() * sizeof(DataType); - - // Initialize cuDNN objects - auto&& input_desc = m_tensors_cudnn_desc.get_prev_activations(); - auto&& gradient_wrt_output_desc = m_tensors_cudnn_desc.get_prev_error_signals(); - - // Determine algorithm and compute kernel gradient - if (using_transposed_convolution) { - cudnnConvolutionBwdFilterAlgo_t kernel_gradient_cudnn_algorithm - = get_backward_filter_algo_cudnn( - local_input.Width(), - gradient_wrt_output_desc, local_gradient_wrt_output.LockedBuffer(), - input_desc, local_input.LockedBuffer(), - m_convolution_cudnn_desc, - m_kernel_cudnn_desc, - workspace_size, workspace.Buffer()); - CHECK_CUDNN(cudnnConvolutionBackwardFilter( - cudnn::get_handle(), - &gradient_scale, - gradient_wrt_output_desc, - local_gradient_wrt_output.LockedBuffer(), - input_desc, - local_input.LockedBuffer(), - m_convolution_cudnn_desc, - kernel_gradient_cudnn_algorithm, - workspace.Buffer(), - workspace_size, - &dst_scale, - m_kernel_cudnn_desc, - kernel_gradient.Buffer())); - } else { - cudnnConvolutionBwdFilterAlgo_t kernel_gradient_cudnn_algorithm - = get_backward_filter_algo_cudnn( - local_input.Width(), - input_desc, local_input.LockedBuffer(), - gradient_wrt_output_desc, local_gradient_wrt_output.LockedBuffer(), - m_convolution_cudnn_desc, - m_kernel_cudnn_desc, - workspace_size, workspace.Buffer()); - CHECK_CUDNN(cudnnConvolutionBackwardFilter( - cudnn::get_handle(), - &gradient_scale, - input_desc, - local_input.LockedBuffer(), - gradient_wrt_output_desc, - local_gradient_wrt_output.LockedBuffer(), - m_convolution_cudnn_desc, - kernel_gradient_cudnn_algorithm, - workspace.Buffer(), - workspace_size, - &dst_scale, - m_kernel_cudnn_desc, - kernel_gradient.Buffer())); - } - } else { - El::Scale(dst_scale, kernel_gradient); - } - } - -#endif // LBANN_HAS_CUDNN - } + void apply_bias_cudnn(); + void compute_gradients_cudnn(bool using_transposed_convolution); /** Convolution with im2col GEMM algorithm. */ - void apply_convolution_im2col(bool during_forward_prop) { - - // Local matrices - const auto& local_kernel = this->m_weights[0]->get_values().LockedMatrix(); - const auto& local_input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); - auto& local_output = (during_forward_prop ? 
- get_local_activations() : - get_local_error_signals()); - - // Matrix parameters - const int output_size = local_output.Height(); - const El::Int local_width = local_input.Width(); - std::vector input_dims, output_dims; - if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); - } - else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); - } - const auto& kernel_dims = get_kernel_dims(); - const auto& kernel_size = std::accumulate(kernel_dims.begin(), - kernel_dims.end(), - 1, std::multiplies()); - - // Initialize matrices - const int m = output_size / output_dims[0]; - const int n = output_dims[0]; - const int k = kernel_size / output_dims[0]; - DMat input_col, output_col; - DMat im2col_matrix(k, m); - const DMat kernel_matrix(k, n, local_kernel.LockedBuffer(), k); - - // Iterate through input columns - for (El::Int col = 0; col < local_width; ++col) { - - // Construct im2col matrix from current input column - El::LockedView(input_col, local_input, El::ALL, El::IR(col)); - im2col(input_col, - im2col_matrix, - input_dims[0], - input_dims.size() - 1, - &input_dims[1], - m_pads.data(), - &kernel_dims[2], - m_strides.data()); - - // Apply convolution to current input column - output_col.Attach(m, n, local_output.Buffer(0, col), m); - El::Gemm(El::TRANSPOSE, El::NORMAL, - DataType(1), im2col_matrix, kernel_matrix, - DataType(0), output_col); - - } - - } + void apply_convolution_im2col(bool during_forward_prop); /** Transposed convolution with im2col GEMM algorithm. */ - void apply_transposed_convolution_im2col(bool during_forward_prop) { - - // Local matrices - const auto& local_kernel = this->m_weights[0]->get_values().LockedMatrix(); - const auto& local_input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); - DMat& local_output = (during_forward_prop ? 
- get_local_activations() : - get_local_error_signals()); + void apply_transposed_convolution_im2col(bool during_forward_prop); - // Matrix parameters - const int input_size = local_input.Height(); - const El::Int local_width = local_input.Width(); - std::vector input_dims, output_dims; - if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); - } - else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); - } - const auto& kernel_dims = get_kernel_dims(); - const auto& kernel_size = std::accumulate(kernel_dims.begin(), - kernel_dims.end(), - 1, std::multiplies()); + void apply_bias_cpu(); - // Initialize matrices - const int m = kernel_size / input_dims[0]; - const int n = input_size / input_dims[0]; - const int k = input_dims[0]; - DMat input_col, output_col; - DMat im2col_matrix(m, n); - const DMat kernel_matrix(m, k, local_kernel.LockedBuffer(), m); - - // Iterate through input columns - for (El::Int col = 0; col < local_width; ++col) { - - // Apply transposed convolution to current input column - input_col.LockedAttach(n, k, local_input.LockedBuffer(0, col), n); - El::Gemm(El::NORMAL, El::TRANSPOSE, - DataType(1), kernel_matrix, input_col, - DataType(0), im2col_matrix); - - // Perform col2im to accumulate contributions from each kernel - // position - El::View(output_col, local_output, El::ALL, El::IR(col)); - col2im(im2col_matrix, - output_col, - output_dims[0], - output_dims.size() - 1, - &output_dims[1], - m_pads.data(), - &kernel_dims[2], - m_strides.data()); - - } - - } - - void apply_bias_cpu() { - - // Return immediately if there is no bias - if (m_bias_scaling_factor == DataType(0)) return; - - // Local matrices - const auto& local_bias = m_weights[1]->get_values().LockedMatrix(); - auto& local_output = get_local_activations(); - - // Matrix parameters - const El::Int local_width = local_output.Width(); - const auto& output_dims = get_output_dims(); - const El::Int num_output_channels = output_dims[0]; - const El::Int num_per_output_channel = get_output_size() / num_output_channels; - - // Apply bias to each output channel - LBANN_OMP_PARALLEL_FOR - for (El::Int channel = 0; channel < num_output_channels; ++channel) { - const El::Int row_start = channel * num_per_output_channel; - const El::Int row_end = (channel+1) * num_per_output_channel; - const DataType bias_term = m_bias_scaling_factor * local_bias(channel, 0); - for (El::Int col = 0; col < local_width; ++col) { - for (El::Int row = row_start; row < row_end; ++row) { - local_output(row, col) += bias_term; - } - } - } - - } - - void compute_gradients_im2col(bool using_transposed_convolution) { - - // Local matrices - const DMat& local_input = get_local_prev_activations(); - const DMat& local_gradient_wrt_output = get_local_prev_error_signals(); - const bool has_local_data = (!local_input.IsEmpty() - && !local_gradient_wrt_output.IsEmpty()); - - // Get convolution parameters - const El::Int local_width = local_input.Width(); - const auto& input_dims = get_input_dims(); - const auto& output_dims = get_output_dims(); - const int num_input_channels = input_dims[0]; - const int num_output_channels = output_dims[0]; - const int num_per_output_channel = get_output_size() / num_output_channels; - const int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size(); - const auto& kernel_dims = get_kernel_dims(); - const auto& kernel_size = std::accumulate(kernel_dims.begin(), - kernel_dims.end(), - 1, std::multiplies()); - - // Compute bias gradient - // Note: 
Sum is computed with Kahan summation - if (m_bias_scaling_factor != DataType(0) - && this->m_weights[1]->get_optimizer() != nullptr) { - optimizer* bias_optimizer = this->m_weights[1]->get_optimizer(); - DataType dst_scale = DataType(0), gradient_scale = DataType(0); - auto& bias_gradient = bias_optimizer->get_gradient_buffer( - dst_scale, gradient_scale, true); - gradient_scale /= effective_mini_batch_size; - if (has_local_data) { - auto& local_bias_gradient = bias_gradient.Matrix(); - LBANN_OMP_PARALLEL_FOR - for (int channel = 0; channel < num_output_channels; ++channel) { - const El::Int row_start = channel * num_per_output_channel; - const El::Int row_end = (channel+1) * num_per_output_channel; - DataType sum = 0; - DataType correction = 0; - for (El::Int col = 0; col < local_width; ++col) { - for (El::Int row = row_start; row < row_end; ++row) { - DataType term = local_gradient_wrt_output(row, col); - term += correction; - const DataType next_sum = sum + term; - correction = term - (next_sum - sum); - sum = next_sum; - } - } - local_bias_gradient(channel, 0) = dst_scale*local_bias_gradient(channel, 0) - + gradient_scale*sum; - } - } else { - El::Scale(dst_scale, bias_gradient); - } - } - - // Stop early if kernel is not being optimized - optimizer* kernel_optimizer = this->m_weights[0]->get_optimizer(); - if (kernel_optimizer == nullptr) { return; } - - // Initialize matrices - const int m = (using_transposed_convolution ? - kernel_size / num_input_channels : - kernel_size / num_output_channels); - const int n = (using_transposed_convolution ? - num_input_channels : - num_output_channels); - const int k = (using_transposed_convolution ? - get_input_size() / num_input_channels : - get_output_size() / num_output_channels); - DataType dst_scale = 0, gradient_scale = 0; - auto& kernel_gradient = kernel_optimizer->get_gradient_buffer( - dst_scale, gradient_scale, true); - El::Scale(dst_scale, kernel_gradient); - gradient_scale /= effective_mini_batch_size; - DMat im2col_matrix(m, k); - DMat kernel_gradient_matrix(m, n, kernel_gradient.Buffer(), m); - - // Compute kernel gradient contributions from each data sample - for (El::Int col = 0; col < local_width; ++col) { - if (using_transposed_convolution) { - const DMat input_col(k, n, local_input.LockedBuffer(0,col), k); - const DMat gradient_wrt_output_col = - El::LockedView(local_gradient_wrt_output, El::ALL, El::IR(col)); - im2col(gradient_wrt_output_col, - im2col_matrix, - num_output_channels, - output_dims.size() - 1, - &output_dims[1], - m_pads.data(), - &kernel_dims[2], - m_strides.data()); - El::Gemm(El::NORMAL, El::NORMAL, - gradient_scale, im2col_matrix, input_col, - DataType(1), kernel_gradient_matrix); - } - else { - const DMat input_col - = El::LockedView(local_input, El::ALL, El::IR(col)); - const DMat gradient_wrt_output_col(k, n, local_gradient_wrt_output.LockedBuffer(0,col), k); - im2col(input_col, - im2col_matrix, - num_input_channels, - input_dims.size() - 1, - &input_dims[1], - m_pads.data(), - &kernel_dims[2], - m_strides.data()); - El::Gemm(El::NORMAL, El::NORMAL, - gradient_scale, im2col_matrix, gradient_wrt_output_col, - DataType(1), kernel_gradient_matrix); - } - } - - } + void compute_gradients_im2col(bool using_transposed_convolution); private: @@ -1023,155 +204,37 @@ class base_convolution_layer : public Layer { /** Copy convolution kernel cuDNN descriptor. 
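// --- Illustrative aside (not part of the patch above) ---
// The bias-gradient loop in the removed compute_gradients_im2col uses Kahan
// (compensated) summation. A minimal standalone sketch of the same
// compensation scheme, assuming a plain std::vector<float> input:
#include <vector>
float kahan_sum(const std::vector<float>& vals) {
  float sum = 0.0f;
  float correction = 0.0f;  // running compensation for lost low-order bits
  for (const float v : vals) {
    const float term = v + correction;       // fold the correction back in
    const float next_sum = sum + term;
    correction = term - (next_sum - sum);    // recover what the add dropped
    sum = next_sum;
  }
  return sum;
}
// --- end aside ---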
*/ static void copy_kernel_cudnn_desc(const cudnnFilterDescriptor_t& src, - cudnnFilterDescriptor_t& dst) { - - // Create or destroy descriptor if needed - if(src != nullptr && dst == nullptr) { - CHECK_CUDNN(cudnnCreateFilterDescriptor(&dst)); - } - else if(src == nullptr && dst != nullptr) { - CHECK_CUDNN(cudnnDestroyFilterDescriptor(dst)); - dst = nullptr; - } - - // Copy descriptor data if needed - if(src != nullptr) { - cudnnDataType_t data_type; - cudnnTensorFormat_t format; - int num_dims; - std::vector dims(1); - CHECK_CUDNN(cudnnGetFilterNdDescriptor(src, - dims.size(), - &data_type, - &format, - &num_dims, - dims.data())); - dims.resize(num_dims); - CHECK_CUDNN(cudnnGetFilterNdDescriptor(src, - num_dims, - &data_type, - &format, - &num_dims, - dims.data())); - CHECK_CUDNN(cudnnSetFilterNdDescriptor(dst, - data_type, - format, - num_dims, - dims.data())); - } - - } - + cudnnFilterDescriptor_t& dst); /** Copy convolution cuDNN descriptor. */ - static void copy_convolution_cudnn_desc(const cudnnConvolutionDescriptor_t& src, - cudnnConvolutionDescriptor_t& dst) { - - // Create or destroy descriptor if needed - if(src != nullptr && dst == nullptr) { - CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&dst)); - } - else if(src == nullptr && dst != nullptr) { - CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(dst)); - dst = nullptr; - } - - // Copy descriptor data if needed - if(src != nullptr) { - cudnnConvolutionMode_t mode; - cudnnDataType_t data_type; - int num_dims; - CHECK_CUDNN(cudnnGetConvolutionNdDescriptor(src, - 0, - &num_dims, - nullptr, - nullptr, - nullptr, - &mode, - &data_type)); - std::vector pads(num_dims), strides(num_dims), dilations(num_dims); - CHECK_CUDNN(cudnnGetConvolutionNdDescriptor(src, - num_dims, - &num_dims, - pads.data(), - strides.data(), - dilations.data(), - &mode, - &data_type)); - int num_groups; - CHECK_CUDNN(cudnnGetConvolutionGroupCount(src, - &num_groups)); - CHECK_CUDNN(cudnnSetConvolutionNdDescriptor(dst, - num_dims, - pads.data(), - strides.data(), - dilations.data(), - mode, - data_type)); - CHECK_CUDNN(cudnnSetConvolutionGroupCount(dst, - num_groups)); - } - - } + static void copy_convolution_cudnn_desc( + const cudnnConvolutionDescriptor_t& src, + cudnnConvolutionDescriptor_t& dst); /** Get the cuDNN algorithm to use for forward prop. */ cudnnConvolutionFwdAlgo_t get_forward_algo_cudnn( const int local_mini_batch_size, const cudnnTensorDescriptor_t& input_desc, - const DataType* input, + const TensorDataType* input, const cudnnFilterDescriptor_t& kernel_desc, - const DataType* kernel, + const TensorDataType* kernel, const cudnnConvolutionDescriptor_t& conv_desc, const cudnnTensorDescriptor_t& output_desc, - DataType* output, + TensorDataType* output, size_t ws_size, - DataType* ws) { - if (m_fwd_cudnn_algos.count(local_mini_batch_size) == 0) { -#ifdef LBANN_DETERMINISTIC - bool deterministic = true; -#else - bool deterministic = false; -#endif - m_fwd_cudnn_algos[local_mini_batch_size] = - cudnn::get_fwd_algorithm( - true, deterministic, - input_desc, input, - kernel_desc, kernel, - conv_desc, - output_desc, output, - ws_size, ws); - } - return m_fwd_cudnn_algos[local_mini_batch_size]; - } + TensorDataType* ws); /** Get the cuDNN algorithm to use for backward-data. 
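// --- Illustrative aside (not part of the patch above) ---
// The get_*_algo_cudnn helpers memoize the algorithm chosen by cuDNN's
// autotuner, keyed on the local mini-batch size, so the expensive search runs
// once per distinct batch size. A minimal sketch of that caching pattern with
// hypothetical types (the real code stores the result in m_fwd_cudnn_algos etc.):
#include <functional>
#include <unordered_map>
template <typename Algo>
class algo_cache {
public:
  // 'search' stands in for the cuDNN autotuning call.
  Algo get(int local_mini_batch_size, const std::function<Algo()>& search) {
    auto it = m_cache.find(local_mini_batch_size);
    if (it == m_cache.end()) {
      it = m_cache.emplace(local_mini_batch_size, search()).first;
    }
    return it->second;
  }
private:
  std::unordered_map<int, Algo> m_cache;
};
// --- end aside ---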
*/ cudnnConvolutionBwdDataAlgo_t get_backward_data_algo_cudnn( const int local_mini_batch_size, const cudnnFilterDescriptor_t& kernel_desc, - const DataType* kernel, + const TensorDataType* kernel, const cudnnTensorDescriptor_t& prev_error_signal_desc, - const DataType* prev_error_signal, + const TensorDataType* prev_error_signal, const cudnnConvolutionDescriptor_t& conv_desc, const cudnnTensorDescriptor_t& error_signal_desc, - DataType* error_signal, + TensorDataType* error_signal, size_t ws_size, - DataType* ws) { - if (m_bwd_data_cudnn_algos.count(local_mini_batch_size) == 0) { -#ifdef LBANN_DETERMINISTIC - bool deterministic = true; -#else - bool deterministic = false; -#endif - m_bwd_data_cudnn_algos[local_mini_batch_size] = - cudnn::get_bwd_data_algorithm( - true, deterministic, - kernel_desc, kernel, - prev_error_signal_desc, prev_error_signal, - conv_desc, - error_signal_desc, error_signal, - ws_size, ws); - } - return m_bwd_data_cudnn_algos[local_mini_batch_size]; - } + TensorDataType* ws); /** * Get the cuDNN algorithm to use for backward-filter. @@ -1180,42 +243,24 @@ class base_convolution_layer : public Layer { cudnnConvolutionBwdFilterAlgo_t get_backward_filter_algo_cudnn( const int local_mini_batch_size, const cudnnTensorDescriptor_t& input_desc, - const DataType* input, + const TensorDataType* input, const cudnnTensorDescriptor_t& prev_error_signal_desc, - const DataType* prev_error_signal, + const TensorDataType* prev_error_signal, const cudnnConvolutionDescriptor_t& conv_desc, const cudnnFilterDescriptor_t& kernel_gradient_desc, size_t ws_size, - DataType* ws) { - if (m_bwd_filter_cudnn_algos.count(local_mini_batch_size) == 0) { -#ifdef LBANN_DETERMINISTIC - bool deterministic = true; -#else - bool deterministic = false; -#endif - // Temporary filter gradient buffer. - GPUMat kernel_gradient; -#ifdef HYDROGEN_HAVE_CUB - kernel_gradient.SetMemoryMode(1); -#endif - kernel_gradient.Resize(this->m_weights[0]->get_matrix_height(), - this->m_weights[0]->get_matrix_width()); - m_bwd_filter_cudnn_algos[local_mini_batch_size] = - cudnn::get_bwd_filter_algorithm( - true, deterministic, - input_desc, input, - prev_error_signal_desc, prev_error_signal, - conv_desc, - kernel_gradient_desc, kernel_gradient.Buffer(), - ws_size, ws); - } - return m_bwd_filter_cudnn_algos[local_mini_batch_size]; - } - + TensorDataType* ws); #endif // LBANN_HAS_CUDNN +#ifdef LBANN_HAS_DISTCONV + friend class base_convolution_adapter; + protected: + using BaseConvAdapterType = base_convolution_adapter; + void setup_distconv_adapter() override; + BaseConvAdapterType& get_distconv_adapter() override; + const BaseConvAdapterType& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; } // namespace lbann - #endif // LBANN_LAYERS_LEARNING_BASE_CONVOLUTION_HPP_INCLUDED diff --git a/include/lbann/layers/learning/channelwise_fully_connected.hpp b/include/lbann/layers/learning/channelwise_fully_connected.hpp new file mode 100644 index 00000000000..928ef732da2 --- /dev/null +++ b/include/lbann/layers/learning/channelwise_fully_connected.hpp @@ -0,0 +1,115 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_LEARNING_CHANNELWISE_FULLY_CONNECTED_HPP_INCLUDED +#define LBANN_LAYERS_LEARNING_CHANNELWISE_FULLY_CONNECTED_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Apply affine transformation to tensor channels. + * + * The input tensor is sliced along the first tensor dimension (the + * "channel" dimension for image data in CHW format) and the same + * affine transformation is applied to each slice. Following a + * row-vector convention: + * @f[ y(i,*) = \text{vec}( x(i,*) ) W^T + b @f] + * + * Two weights are required if bias is applied: the linearity and the + * bias. Only the linearity weights are required if bias is not + * applied. If weights aren't provided, the linearity weights are + * initialized with He normal initialization and the bias weights are + * initialized to zero. + * + */ +template +class channelwise_fully_connected_layer + : public data_type_layer { + + static_assert(Layout == data_layout::DATA_PARALLEL, + "channelwise_fully_connected layer " + "only supports data parallel layout"); + +public: + + /** @param comm LBANN communicator. + * @param output_channel_dims Output tensor dimensions, + * excluding the first dimension. + * @param bias Whether to apply bias. + * @param transpose Whether to apply transpose of + * weights matrix. + */ + channelwise_fully_connected_layer( + lbann_comm* comm, + std::vector output_channel_dims, + bool bias, + bool transpose); + + channelwise_fully_connected_layer( + const channelwise_fully_connected_layer& other) = default; + channelwise_fully_connected_layer& operator=( + const channelwise_fully_connected_layer& other) = default; + ~channelwise_fully_connected_layer() = default; + + channelwise_fully_connected_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_data(size_t max_mini_batch_size) override; + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Whether to apply bias. */ + bool m_has_bias; + /** Whether to transpose linearity. 
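// --- Illustrative aside (not part of the patch above) ---
// The channel-wise fully-connected layer applies the same affine map to every
// channel slice, y(i,*) = vec(x(i,*)) W^T + b. A naive reference sketch with
// hypothetical raw-pointer arguments (channels x in_size input and
// channels x out_size output, both row-major); not the layer's actual kernel:
#include <cstddef>
void channelwise_affine(const float* x, const float* W, const float* b,
                        float* y, std::size_t channels,
                        std::size_t in_size, std::size_t out_size) {
  for (std::size_t c = 0; c < channels; ++c) {          // same W, b for every channel
    for (std::size_t o = 0; o < out_size; ++o) {
      float acc = b ? b[o] : 0.0f;                      // bias is optional
      for (std::size_t i = 0; i < in_size; ++i) {
        acc += x[c * in_size + i] * W[o * in_size + i]; // row o of W
      }
      y[c * out_size + o] = acc;
    }
  }
}
// --- end aside ---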
*/ + bool m_transpose; + +}; + +// Builder function +LBANN_DEFINE_LAYER_BUILDER(channelwise_fully_connected); + +// Explicit template instantiation +#ifndef LBANN_CHANNELWISE_FULLY_CONNECTED_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class channelwise_fully_connected_layer< \ + T, data_layout::DATA_PARALLEL, Device> +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_CHANNELWISE_FULLY_CONNECTED_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_LEARNING_CHANNELWISE_FULLY_CONNECTED_HPP_INCLUDED diff --git a/include/lbann/layers/learning/channelwise_scale_bias.hpp b/include/lbann/layers/learning/channelwise_scale_bias.hpp new file mode 100644 index 00000000000..6d38b6b19b8 --- /dev/null +++ b/include/lbann/layers/learning/channelwise_scale_bias.hpp @@ -0,0 +1,196 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED +#define LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/exception.hpp" + +namespace lbann { + +/** @brief Apply scale and bias to tensor channels. + * + * The input tensor is sliced along the first tensor dimension (the + * "channel" dimension, assuming image data in CHW format) and scale + * and bias terms are applied independently to each slice. More + * precisely, given input and output tensors + * @f$ X,Y\in\mathbb{R}^{d_1\times\cdots\times d_n} @f$ + * and scale and bias vectors @f$ a,b\in\mathbb{R}^{d_1} @f$: + * @f[ + * Y_{i,j,\cdots} = a_i X_{i,j,\cdots} + b_i + * @f] + * + * The scale and bias vectors are fused into a single weights tensor + * to reduce the number of gradient allreduces during backprop. In + * particular, the weights tensor is a + * @f$ \text{num\_channels} \times 2 @f$ matrix, where the first + * column correspond to scale terms and the second column to bias + * terms. + */ +template +class channelwise_scale_bias_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "channelwise_mean_layer only supports " + "data-parallel data layout"); +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. 
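// --- Illustrative aside (not part of the patch above) ---
// Channel-wise scale/bias applies Y[c][k] = a[c] * X[c][k] + b[c], with a and
// b fused into one num_channels x 2 weights matrix (first column scale,
// second column bias). A naive reference sketch with hypothetical raw-pointer
// arguments; the column-major layout of the fused weights is an assumption:
#include <cstddef>
void channelwise_scale_bias_ref(const float* x, const float* scale_bias,
                                float* y, std::size_t num_channels,
                                std::size_t channel_size) {
  for (std::size_t c = 0; c < num_channels; ++c) {
    const float a = scale_bias[c];                 // column 0: scale
    const float b = scale_bias[num_channels + c];  // column 1: bias
    for (std::size_t k = 0; k < channel_size; ++k) {
      y[c * channel_size + k] = a * x[c * channel_size + k] + b;
    }
  }
}
// --- end aside ---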
*/ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + ///@} + +public: + + channelwise_scale_bias_layer(lbann_comm *comm); + channelwise_scale_bias_layer(const channelwise_scale_bias_layer& other); + channelwise_scale_bias_layer& operator=( + const channelwise_scale_bias_layer& other); + + channelwise_scale_bias_layer* copy() const override { + return new channelwise_scale_bias_layer(*this); + } + + std::string get_type() const override { return "channel-wise scale/bias"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + void setup_matrices(const El::Grid& grid) override; + void setup_data(size_t max_mini_batch_size) override; + +protected: + + void fp_compute() override; + void bp_compute() override; + +private: + + /** @brief Objective function gradient w.r.t. weights. */ + std::unique_ptr m_weights_gradient; + +}; + +// Implementation +template +channelwise_scale_bias_layer +::channelwise_scale_bias_layer(lbann_comm *comm) + : data_type_layer(comm) +{} + +template +channelwise_scale_bias_layer +::channelwise_scale_bias_layer(const channelwise_scale_bias_layer& other) + : data_type_layer(other), + m_weights_gradient(other.m_weights_gradient + ? other.m_weights_gradient->Copy() + : nullptr) +{} + +template +auto channelwise_scale_bias_layer +::operator=(const channelwise_scale_bias_layer& other) + -> channelwise_scale_bias_layer& { + data_type_layer::operator=(other); + m_weights_gradient.reset(other.m_weights_gradient + ? other.m_weights_gradient->Copy() + : nullptr); + return *this; +} + +template +void channelwise_scale_bias_layer +::setup_matrices(const El::Grid& grid) { + data_type_layer::setup_matrices(grid); + m_weights_gradient.reset(new StarMatDT(grid)); +} + +template +void channelwise_scale_bias_layer +::setup_data(size_t max_mini_batch_size) { + data_type_layer::setup_data(max_mini_batch_size); + const El::Int num_channels = this->get_output_dims()[0]; + + // Construct default weights if needed + // Note: Scale is initialized to 1 and bias to 0 + if (!this->has_weights()) { + auto w = make_unique(this->get_comm()); + std::vector vals(2*num_channels, + El::TypeTraits::Zero()); + std::fill(vals.begin(), vals.begin()+num_channels, + El::TypeTraits::One()); + auto init = make_unique>(vals); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); + } + if (this->num_weights() != 1) { + LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 1, found ",this->num_weights(),")"); + } + + // Setup weights + auto dist = this->get_prev_activations().DistData(); + dist.colDist = El::STAR; + dist.rowDist = El::STAR; + this->get_weights(0).set_dims({static_cast(num_channels)}, {2}); + this->get_weights(0).set_matrix_distribution(dist); + + // Setup gradient w.r.t. 
weights + m_weights_gradient->AlignWith(dist); + m_weights_gradient->Resize(num_channels, 2); +} + +LBANN_DEFINE_LAYER_BUILDER(channelwise_scale_bias); + +#ifndef LBANN_CHANNELWISE_SCALE_BIAS_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class channelwise_scale_bias_layer; + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CHANNELWISE_SCALE_BIAS_LAYER_INSTANTIATE + + +} // namespace lbann + +#endif // LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED diff --git a/include/lbann/layers/learning/convolution.hpp b/include/lbann/layers/learning/convolution.hpp index 9a7cf276a5d..19fb2daf248 100644 --- a/include/lbann/layers/learning/convolution.hpp +++ b/include/lbann/layers/learning/convolution.hpp @@ -29,20 +29,51 @@ #include "lbann/layers/learning/base_convolution.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +// Forward declaration. +namespace callback { +class imcomm; +} + +#ifdef LBANN_HAS_DISTCONV +template +class convolution_distconv_adapter + : public base_convolution_adapter { +public: + using TensorDevType = typename base_convolution_adapter::TensorDevType; + + convolution_distconv_adapter(Layer& layer) + : base_convolution_adapter(layer) + {} + virtual ~convolution_distconv_adapter() = default; + + void setup_distributions(tensor_overlap_constraints &constraints) override; + void setup_layer(size_t workspace_capacity) override; + dc::Shape get_activations_local_shape(int index=0) const override; +}; +#endif // LBANN_HAS_DISTCONV + /** @brief Standard deep learning convolution. * * Applies convolution (more precisely, cross-correlation) to input * tensors. This is primarily optimized for image data in NCHW * format. */ -template -class convolution_layer : public base_convolution_layer { +template +class convolution_layer + : public base_convolution_layer { + + static_assert(Layout == data_layout::DATA_PARALLEL, + "convolution layer only supports DATA_PARALLEL"); + private: - friend class lbann_callback_imcomm; + friend class callback::imcomm; public: @@ -54,16 +85,7 @@ class convolution_layer : public base_convolution_layer { int stride, int dilation, int groups, - bool has_bias = true) - : convolution_layer(comm, - num_data_dims, - num_output_channels, - std::vector(num_data_dims, conv_dim), - std::vector(num_data_dims, pad), - std::vector(num_data_dims, stride), - std::vector(num_data_dims, dilation), - groups, - has_bias) {} + bool has_bias = true); convolution_layer(lbann_comm *comm, int num_data_dims, @@ -73,21 +95,7 @@ class convolution_layer : public base_convolution_layer { std::vector strides, std::vector dilations, int groups, - bool has_bias = true) - : base_convolution_layer( - comm, - num_data_dims, - num_output_channels, - std::move(conv_dims), - std::move(pads), - std::move(strides), - std::move(dilations), - groups, - has_bias) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "convolution layer only supports DATA_PARALLEL"); - - } + bool has_bias = true); convolution_layer* copy() const override { return new convolution_layer(*this); } @@ -99,62 +107,32 @@ class convolution_layer : public base_convolution_layer { protected: - void setup_dims() override { - base_convolution_layer::setup_dims(); - - // Get tensor dimensions - const auto& input_dims = this->get_input_dims(); - auto output_dims = input_dims; - - // Initialize output tensor dimensions - output_dims[0] = this->m_output_channels; - for (size_t i = 0; i < output_dims.size() - 1; 
++i) { - const auto& input_dim = input_dims[i+1]; - const auto& kernel_dim = this->m_conv_dims[i]; - const auto& stride = this->m_strides[i]; - const auto& pad = this->m_pads[i]; - const auto& dilation = this->m_dilations[i]; - const auto& effective_dim = (input_dim - + 2 * pad - - dilation * (kernel_dim-1)); - output_dims[i+1] = (effective_dim + stride - 1) / stride; - } - this->set_output_dims(output_dims); - - } - - std::vector get_kernel_dims() const { - std::vector dims; - dims.push_back(this->m_output_channels); - dims.push_back(this->get_input_dims()[0] / this->m_groups); - dims.insert(dims.end(), - this->m_conv_dims.begin(), - this->m_conv_dims.end()); - return dims; - } - - void fp_compute() override { - if(this->using_gpus()) { - base_convolution_layer::apply_convolution_cudnn(true); - base_convolution_layer::apply_bias_cudnn(); - } else { - base_convolution_layer::apply_convolution_im2col(true); - base_convolution_layer::apply_bias_cpu(); - } - } - - void bp_compute() override { - if(this->using_gpus()) { - base_convolution_layer::compute_gradients_cudnn(false); - base_convolution_layer::apply_transposed_convolution_cudnn(false); - } else { - base_convolution_layer::compute_gradients_im2col(false); - base_convolution_layer::apply_transposed_convolution_im2col(false); - } - } - + void setup_dims(DataReaderMetaData& dr_metadata) override; + std::vector get_kernel_dims() const override; + void fp_compute() override; + void bp_compute() override; + +#ifdef LBANN_HAS_DISTCONV + friend class convolution_distconv_adapter; + protected: + void setup_distconv_adapter() override; + bool is_distconv_supported() const override; +#endif // LBANN_HAS_DISTCONV }; +// Builder function +LBANN_DEFINE_LAYER_BUILDER(convolution); + +#ifndef LBANN_CONVOLUTION_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class convolution_layer; + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CONVOLUTION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LEARNING_CONVOLUTION_HPP_INCLUDED diff --git a/include/lbann/layers/learning/deconvolution.hpp b/include/lbann/layers/learning/deconvolution.hpp index f3c1f7bdd9e..6ebce704f9b 100644 --- a/include/lbann/layers/learning/deconvolution.hpp +++ b/include/lbann/layers/learning/deconvolution.hpp @@ -28,19 +28,38 @@ #define LBANN_LAYERS_LEARNING_DECONVOLUTION_HPP_INCLUDED #include "lbann/layers/learning/base_convolution.hpp" -#include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { // Forward declaration. -class lbann_callback_imcomm; +namespace callback { +class imcomm; +} + +#ifdef LBANN_HAS_DISTCONV +template +class deconvolution_distconv_adapter: public base_convolution_adapter { + public: + using TensorDevType = typename base_convolution_adapter::TensorDevType; + + deconvolution_distconv_adapter(Layer& layer): base_convolution_adapter(layer) {} + virtual ~deconvolution_distconv_adapter() = default; + + void setup_distributions(tensor_overlap_constraints &constraints) override; + void setup_layer(size_t workspace_capacity) override; + dc::Shape get_activations_local_shape(int index=0) const override; +}; +#endif // LBANN_HAS_DISTCONV /** @brief Transpose of the convolution layer. 
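// --- Illustrative aside (not part of the patch above) ---
// The setup_dims logic moved out of these headers computes output shapes per
// dimension as in the removed code: convolution uses the ceiling of the
// effective extent over the stride, and the transposed (deconvolution) case
// inverts it. A standalone sketch of both formulas, assuming unit dilation
// for the transposed case as the removed code does:
inline int conv_output_dim(int input_dim, int kernel_dim, int pad,
                           int stride, int dilation) {
  const int effective_dim = input_dim + 2 * pad - dilation * (kernel_dim - 1);
  return (effective_dim + stride - 1) / stride;   // ceil(effective_dim / stride)
}
inline int deconv_output_dim(int input_dim, int kernel_dim, int pad, int stride) {
  return (input_dim - 1) * stride + kernel_dim - 2 * pad;
}
// --- end aside ---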
*/ -template -class deconvolution_layer : public base_convolution_layer { +template +class deconvolution_layer : public base_convolution_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "deconvolution layer only supports DATA_PARALLEL"); private: - friend class lbann_callback_imcomm; + friend class callback::imcomm; public: @@ -52,16 +71,7 @@ class deconvolution_layer : public base_convolution_layer { int stride, int dilation, int groups, - bool has_bias = true) - : deconvolution_layer(comm, - num_data_dims, - num_output_channels, - std::vector(num_data_dims, conv_dim), - std::vector(num_data_dims, pad), - std::vector(num_data_dims, stride), - std::vector(num_data_dims, dilation), - groups, - has_bias) {} + bool has_bias = true); deconvolution_layer(lbann_comm *comm, int num_data_dims, @@ -71,108 +81,43 @@ class deconvolution_layer : public base_convolution_layer { std::vector strides, std::vector dilations, int groups, - bool has_bias = true) - : base_convolution_layer( - comm, - num_data_dims, - num_output_channels, - std::move(conv_dims), - std::move(pads), - std::move(strides), - std::move(dilations), - groups, - has_bias) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "deconvolution layer only supports DATA_PARALLEL"); + bool has_bias = true); + deconvolution_layer* copy() const override { + return new deconvolution_layer(*this); } - deconvolution_layer* copy() const override { return new deconvolution_layer(*this); } - std::string get_type() const override { return "deconvolution"; } data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } - void setup_dims() override { - base_convolution_layer::setup_dims(); - std::stringstream err; - - // Get tensor dimensions - const auto& input_dims = this->get_input_dims(); - auto output_dims = input_dims; - - // Check for unsupported features - /// @todo Implement dilated and grouped deconvolution - if (std::any_of(this->m_dilations.begin(), - this->m_dilations.end(), - [] (int d) { return d != 1; })) { - err << this->get_type() << " layer " - << "\"" << this->get_name() << "\" " - << "has non-unit dilations ("; - for (size_t i = 0; i < this->m_dilations.size(); ++i) { - err << (i > 0 ? 
", " : "") << this->m_dilations[i]; - } - err << ")"; - LBANN_ERROR(err.str()); - } - if (this->m_groups != 1) { - err << this->get_type() << " layer " - << "\"" << this->get_name() << "\" " - << "has non-unit groups " - << "(" << this->m_groups << ")"; - LBANN_ERROR(err.str()); - } - - // Initialize output tensor dimensions - /// @todo Dilated deconvolution - output_dims[0] = this->m_output_channels; - for (size_t i = 0; i < output_dims.size() - 1; ++i) { - const auto& input_dim = input_dims[i+1]; - const auto& kernel_dim = this->m_conv_dims[i]; - const auto& stride = this->m_strides[i]; - const auto& pad = this->m_pads[i]; - // const auto& dilation = this->m_dilations[i]; - output_dims[i+1] = (input_dim-1) * stride + kernel_dim - 2 * pad; - } - this->set_output_dims(output_dims); - - } + void setup_dims(DataReaderMetaData& dr_metadata) override; protected: - std::vector get_kernel_dims() const { - std::vector dims; - dims.push_back(this->get_input_dims()[0]); - dims.push_back(this->m_output_channels); - dims.insert(dims.end(), - this->m_conv_dims.begin(), - this->m_conv_dims.end()); - return dims; - } + std::vector get_kernel_dims() const override; + void fp_compute() override; + void bp_compute() override; - void fp_compute() override { - if(this->using_gpus()) { - base_convolution_layer::apply_transposed_convolution_cudnn(true); - base_convolution_layer::apply_bias_cudnn(); - } else { - base_convolution_layer::apply_transposed_convolution_im2col(true); - base_convolution_layer::apply_bias_cpu(); - } - } +#ifdef LBANN_HAS_DISTCONV + friend class deconvolution_distconv_adapter; + protected: + void setup_distconv_adapter() override; + bool is_distconv_supported() const override; +#endif // LBANN_HAS_DISTCONV +}; - void bp_compute() override { - if(this->using_gpus()) { - base_convolution_layer::compute_gradients_cudnn(true); - base_convolution_layer::apply_convolution_cudnn(false); - } else { - base_convolution_layer::compute_gradients_im2col(true); - base_convolution_layer::apply_convolution_im2col(false); - } - } +#ifndef LBANN_DECONVOLUTION_LAYER_INSTANTIATE -}; +#define PROTO_DEVICE(T, Device) \ + extern template class deconvolution_layer; + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_DECONVOLUTION_LAYER_INSTANTIATE } // namespace lbann diff --git a/include/lbann/layers/learning/embedding.hpp b/include/lbann/layers/learning/embedding.hpp new file mode 100644 index 00000000000..5c26e1975f3 --- /dev/null +++ b/include/lbann/layers/learning/embedding.hpp @@ -0,0 +1,267 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED +#define LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/memory.hpp" + +namespace lbann { + +/** @brief Lookup table to vectors of fixed size. + * + * Each input value is interpreted as an index and the corresponding + * embedding vector is output. Thus, given an input vector of length + * @f$ \text{sequence\_length} @f$, the output is a + * @f$ \text{sequence\_length} \times \text{embedding\_dim} @f$ tensor. + * If an index is out-of-range, then corresponding output is a vector + * of zeros. + * + * The embedding vectors are stored in an + * @f$ \text{embedding\_dim} \times \text{num\_embeddings} @f$ + * weights matrix. Note that this is the transpose of the weights in + * the PyTorch embedding layer. + */ +template +class embedding_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "embedding layer only supports data parallel layout"); +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + ///@} + +public: + + /** + * @param comm LBANN communicator. + * @param num_embeddings Size of dictionary of embeddings. + * @param embedding_dim Size of embedding vectors. + * @param padding_idx If set, then the corresponding embedding + * vector is initialized with zeros. The + * objective function gradient w.r.t. this + * embedding vector is always zero. + */ + embedding_layer(lbann_comm* comm, + size_t num_embeddings, + size_t embedding_dim, + El::Int padding_idx=-1); + + embedding_layer(const embedding_layer& other); + embedding_layer& operator=(const embedding_layer& other); + ~embedding_layer() = default; + + embedding_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + + description get_description() const override; + +protected: + + void setup_matrices(const El::Grid& grid) override; + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_data(size_t max_mini_batch_size) override; + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Size of dictionary of embeddings. */ + size_t m_num_embeddings; + /** Size of embedding vectors. */ + size_t m_embedding_dim; + /** If the padding index is set, then the corresponding embedding + * vector is initialized with zeros. The objective function + * gradient w.r.t. this embedding vector is always zero. + */ + El::Int m_padding_idx; + + /** Gradient w.r.t. embedding weights. 
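// --- Illustrative aside (not part of the patch above) ---
// The embedding layer is a table lookup: each input value is an index into an
// embedding_dim x num_embeddings weights matrix (one embedding vector per
// column), and out-of-range indices produce zero vectors. A naive reference
// sketch assuming a column-major table; not the layer's actual implementation:
#include <cstddef>
#include <vector>
std::vector<float> embed(const std::vector<long>& indices,
                         const std::vector<float>& table,  // embedding_dim x num_embeddings
                         std::size_t embedding_dim,
                         std::size_t num_embeddings) {
  std::vector<float> out(indices.size() * embedding_dim, 0.0f);
  for (std::size_t i = 0; i < indices.size(); ++i) {
    const long idx = indices[i];
    if (idx < 0 || idx >= static_cast<long>(num_embeddings)) {
      continue;                                   // out-of-range index -> zero vector
    }
    for (std::size_t d = 0; d < embedding_dim; ++d) {
      out[i * embedding_dim + d] =
          table[static_cast<std::size_t>(idx) * embedding_dim + d];  // column idx
    }
  }
  return out;
}
// --- end aside ---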
*/ + std::unique_ptr m_embeddings_grad; + +}; + +// ========================================================= +// Implementation +// ========================================================= + +template +embedding_layer::embedding_layer( + lbann_comm* comm, + size_t num_embeddings, + size_t embedding_dim, + El::Int padding_idx) + : data_type_layer(comm), + m_num_embeddings{num_embeddings}, + m_embedding_dim{embedding_dim}, + m_padding_idx{padding_idx} {} + +template +embedding_layer::embedding_layer( + const embedding_layer& other) + : data_type_layer(other), + m_num_embeddings{other.m_num_embeddings}, + m_embedding_dim{other.m_embedding_dim}, + m_padding_idx{other.m_padding_idx}, + m_embeddings_grad(other.m_embeddings_grad + ? other.m_embeddings_grad->Copy() + : nullptr) {} + +template +embedding_layer& embedding_layer::operator=( + const embedding_layer& other) { + data_type_layer::operator=(other); + m_num_embeddings = other.m_num_embeddings; + m_embedding_dim = other.m_embedding_dim; + m_padding_idx = other.m_padding_idx; + m_embeddings_grad.reset(other.m_embeddings_grad + ? other.m_embeddings_grad->Copy() + : nullptr); + return *this; +} + +template +embedding_layer* embedding_layer::copy() const { + return new embedding_layer(*this); +} + +template +std::string embedding_layer::get_type() const { + return "embedding"; +} + +template +data_layout embedding_layer::get_data_layout() const { + return Layout; +} + +template +El::Device embedding_layer::get_device_allocation() const { + return Device; +} + +template +description embedding_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Num embeddings", m_num_embeddings); + desc.add("Embedding dim", m_embedding_dim); + desc.add("Padding index", m_padding_idx); + return desc; +} + +template +void embedding_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + auto dims = this->get_input_dims(); + dims.push_back(static_cast(m_embedding_dim)); + this->set_output_dims(dims); +} + +template +void embedding_layer::setup_data(size_t max_mini_batch_size) { + data_type_layer::setup_data(max_mini_batch_size); + + // Construct default weights if needed + // Note: Randomly drawn from normal distribution with mean 0 and + // standard deviation 1. + if (!this->has_weights()) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::Zero(), + El::TypeTraits::One()); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); + } + if (this->num_weights() != 1) { + LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 1, found ",this->num_weights(),")"); + } + + // Initialize dictionary + auto& embeddings = this->get_weights(0); + auto matrix_dist = this->get_prev_activations().DistData(); + matrix_dist.colDist = El::STAR; + matrix_dist.rowDist = El::STAR; + embeddings.set_dims({static_cast(m_embedding_dim)}, + {static_cast(m_num_embeddings)}); + embeddings.set_matrix_distribution(matrix_dist); + embeddings.setup(); + + // Zero out embedding vector for padding index + if (0 <= m_padding_idx + && m_padding_idx < static_cast(m_embedding_dim)) { + // FIXME (trb 06/01/2020): Assuming embedding values have data + // type that matches this layer. 
In future, we should abstract + // this or dynamically dispatch it. + auto& embedding_values = + dynamic_cast(embeddings.get_values()); + std::unique_ptr pad_embedding( + embedding_values.Construct(embedding_values.Grid(), + embedding_values.Root())); + El::View(*pad_embedding, embedding_values, El::ALL, El::IR(m_padding_idx)); + El::Zero(*pad_embedding); + } + + // Initialize gradient w.r.t. embeddings + m_embeddings_grad->Resize(m_embedding_dim, m_num_embeddings); + +} + +LBANN_DEFINE_LAYER_BUILDER(embedding); + +#ifndef LBANN_EMBEDDING_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class embedding_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_EMBEDDING_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED diff --git a/include/lbann/layers/learning/entrywise_scale_bias.hpp b/include/lbann/layers/learning/entrywise_scale_bias.hpp new file mode 100644 index 00000000000..cd4535e059b --- /dev/null +++ b/include/lbann/layers/learning/entrywise_scale_bias.hpp @@ -0,0 +1,247 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED +#define LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/exception.hpp" + +namespace lbann { + +/** @brief Apply scale and bias to tensor entries. + * + * Scale and bias terms are applied independently to each tensor + * entry. More precisely, given input, output, scale, and bias + * tensors @f$ X,Y,A,B\in\mathbb{R}^{d_1\times\cdots\times d_n} @f$: + * @f[ + * Y = A \circ X + B + * @f] + * + * The scale and bias terms are fused into a single weights tensor to + * reduce the number of gradient allreduces during backprop. In + * particular, the weights tensor is a + * @f$ \text{size} \times 2 @f$ matrix, where the first + * column correspond to scale terms and the second column to bias + * terms. + */ +template +class entrywise_scale_bias_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. 
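// --- Illustrative aside (not part of the patch above) ---
// Entry-wise scale/bias applies y[i] = a[i] * x[i] + b[i], with a and b fused
// into a size x 2 weights matrix (first column scale, second column bias) to
// cut the number of gradient allreduces. A minimal sketch, again assuming a
// column-major layout for the fused weights:
#include <cstddef>
void entrywise_scale_bias_ref(const float* x, const float* scale_bias,
                              float* y, std::size_t size) {
  for (std::size_t i = 0; i < size; ++i) {
    y[i] = scale_bias[i] * x[i] + scale_bias[size + i];
  }
}
// --- end aside ---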
*/ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + ///@} + +public: + + entrywise_scale_bias_layer(lbann_comm *comm); + entrywise_scale_bias_layer(const entrywise_scale_bias_layer& other); + entrywise_scale_bias_layer& operator=( + const entrywise_scale_bias_layer& other); + + entrywise_scale_bias_layer* copy() const override { + return new entrywise_scale_bias_layer(*this); + } + + std::string get_type() const override { return "entry-wise scale/bias"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + void setup_matrices(const El::Grid& grid) override; + void setup_data(size_t max_mini_batch_size) override; + + void fp_setup_outputs(El::Int mini_batch_size) override; + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override; + +protected: + + void fp_compute() override; + void bp_compute() override; + +private: + + /** @brief Objective function gradient w.r.t. weights. */ + std::unique_ptr m_weights_gradient; + +}; + +// Implementation +template +entrywise_scale_bias_layer +::entrywise_scale_bias_layer(lbann_comm *comm) + : data_type_layer(comm) +{} + +template +entrywise_scale_bias_layer +::entrywise_scale_bias_layer(const entrywise_scale_bias_layer& other) + : data_type_layer(other), + m_weights_gradient(other.m_weights_gradient ? + other.m_weights_gradient->Copy() : nullptr) +{} + +template +auto entrywise_scale_bias_layer +::operator=(const entrywise_scale_bias_layer& other) + -> entrywise_scale_bias_layer& { + data_type_layer::operator=(other); + m_weights_gradient.reset(other.m_weights_gradient ? + other.m_weights_gradient->Copy() : + nullptr); + return *this; +} + +template +void +entrywise_scale_bias_layer +::setup_matrices(const El::Grid& grid) { + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); + dist.rowDist = El::STAR; + m_weights_gradient.reset(AbsDistMatrixType::Instantiate(dist)); +} + +template +void +entrywise_scale_bias_layer +::setup_data(size_t max_mini_batch_size) { + data_type_layer::setup_data(max_mini_batch_size); + + // Initialize output dimensions + this->set_output_dims(this->get_input_dims()); + const auto output_dims = this->get_output_dims(); + const El::Int output_size = this->get_output_size(); + + // Construct default weights if needed + // Note: Scale is initialized to 1 and bias to 0 + if (!this->has_weights()) { + auto w = make_unique(this->get_comm()); + std::vector vals(2*output_size, + El::TypeTraits::Zero()); + std::fill(vals.begin(), vals.begin()+output_size, + El::TypeTraits::One()); + auto init = make_unique>(vals); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); + } + if (this->num_weights() != 1) { + LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 1, found ",this->num_weights(),")"); + } + + // Setup weights + auto dist = this->get_prev_activations().DistData(); + dist.rowDist = El::STAR; + this->get_weights(0).set_dims(output_dims, + {static_cast(2)}); + this->get_weights(0).set_matrix_distribution(dist); + + // Setup gradient w.r.t. 
weights + m_weights_gradient->AlignWith(dist); + m_weights_gradient->Resize(output_size, 2); +} + +template +void +entrywise_scale_bias_layer +::fp_setup_outputs(El::Int mini_batch_size) { + data_type_layer::fp_setup_outputs(mini_batch_size); + +#if 0 /// @todo See https://github.com/LLNL/lbann/issues/1123 + + // Check that input and weights tensors are aligned + /// @todo Realign weights tensor if misaligned + bool aligned = true; + try { + const auto& x = this->get_prev_activations(); + const auto& w = m_weights[0]->get_values(); + aligned = (x.ColAlign() == w.ColAlign() + && x.RowAlign() == w.RowAlign()); + } + catch (const exception& e) { + // An exception is thrown if you try accessing weights values + // before they are initialized. We don't care if this case is + // aligned, so it's safe to ignore. + } + if (!aligned) { + std::ostringstream err; + err << this->get_type() << " layer \"" << this->get_name() << "\" " + << "has misaligned input and weights matrices"; + LBANN_ERROR(err.str()); + } + +#endif // 0 + +} + +template +void +entrywise_scale_bias_layer +::bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + data_type_layer::bp_setup_gradient_wrt_inputs(mini_batch_size); + m_weights_gradient->Empty(false); + m_weights_gradient->AlignWith(this->get_prev_activations()); + m_weights_gradient->Resize(this->get_input_size(), 2); +} + + +LBANN_DEFINE_LAYER_BUILDER(entrywise_scale_bias); + +#ifndef LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class entrywise_scale_bias_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class entrywise_scale_bias_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED diff --git a/include/lbann/layers/learning/fully_connected.hpp b/include/lbann/layers/learning/fully_connected.hpp index f62c2318594..68b6ce23135 100644 --- a/include/lbann/layers/learning/fully_connected.hpp +++ b/include/lbann/layers/learning/fully_connected.hpp @@ -29,67 +29,55 @@ #include "lbann/layers/learning/learning.hpp" #include "lbann/models/model.hpp" -#include "lbann/weights/initializer.hpp" -#include "lbann/weights/variance_scaling_initializers.hpp" + #include -#include namespace lbann { -/** @brief Perform an affine transformation. */ -template -class fully_connected_layer : public learning_layer { +/** @brief Affine transformation + * + * Flattens the input tensor, multiplies with a weights matrix, and + * optionally applies an entry-wise bias. Following the + * column-vector convention: + * @f[ y = W * \text{vec}(x) + b @f] + * + * Two weights are required if bias is applied: the linearity and the + * bias. Only the linearity weights are required if bias is not + * applied. If weights aren't provided, the linearity weights are + * initialized with He normal initialization and the bias weights are + * initialized to zero. 
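// --- Illustrative aside (not part of the patch above) ---
// The fully-connected layer flattens its input and computes y = W * vec(x) + b,
// or uses W^T when the transpose option is set. A naive reference GEMV sketch
// with hypothetical raw-pointer arguments (W stored row-major as rows x cols);
// the real layer dispatches to Elemental/cuBLAS GEMMs instead:
#include <cstddef>
void affine_forward(const float* W, std::size_t rows, std::size_t cols,
                    bool transpose, const float* x, const float* b, float* y) {
  const std::size_t out_size = transpose ? cols : rows;
  const std::size_t in_size  = transpose ? rows : cols;
  for (std::size_t o = 0; o < out_size; ++o) {
    float acc = b ? b[o] : 0.0f;                 // optional bias
    for (std::size_t i = 0; i < in_size; ++i) {
      const float w = transpose ? W[i * cols + o] : W[o * cols + i];
      acc += w * x[i];
    }
    y[o] = acc;
  }
}
// --- end aside ---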
+ */ +template +class fully_connected_layer : public learning_layer { public: + /** @name Public Types */ + ///@{ - /** @todo Accept a vector for output_size */ - fully_connected_layer(lbann_comm *comm, - int output_size, - bool transpose = false, - weights* weight = nullptr, - bool has_bias = true) - : learning_layer(comm), - m_bias_gradient(nullptr), - m_transpose(transpose) { - - // Initialize output tensor dimensions - set_output_dims({output_size}); + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; - // Initialize bias - m_bias_scaling_factor = has_bias ? DataType(1) : DataType(0); + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; - } + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; - fully_connected_layer(const fully_connected_layer& other) : - learning_layer(other), - m_bias_scaling_factor(other.m_bias_scaling_factor), - m_transpose(other.m_transpose) { + ///@} - // Deep matrix copies - m_bias_gradient = other.m_bias_gradient; - if (m_bias_gradient != nullptr) { - m_bias_gradient = m_bias_gradient->Copy(); - } - - } +public: - fully_connected_layer& operator=(const fully_connected_layer& other) { - learning_layer::operator=(other); - m_bias_scaling_factor = other.m_bias_scaling_factor; - m_transpose = other.m_transpose; + /** @todo Accept a vector for output_size */ + fully_connected_layer(lbann_comm *comm, + int output_size, + bool transpose = false, + WeightsType* weight = nullptr, + bool has_bias = true); - // Deep matrix copies - deallocate_matrices(); - m_bias_gradient = other.m_bias_gradient; - if (m_bias_gradient != nullptr) { - m_bias_gradient = m_bias_gradient->Copy(); - } + fully_connected_layer(const fully_connected_layer& other); - return *this; - } + fully_connected_layer& operator=(const fully_connected_layer& other); - ~fully_connected_layer() override { - deallocate_matrices(); - } + ~fully_connected_layer() override; fully_connected_layer* copy() const override { return new fully_connected_layer(*this); @@ -99,110 +87,12 @@ class fully_connected_layer : public learning_layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - description get_description() const override { - auto&& desc = learning_layer::get_description(); - const auto& bias_str = (m_bias_scaling_factor == DataType(0) ? 
- "disabled" : "enabled"); - desc.add("Bias", bias_str); - return desc; - } + description get_description() const override; protected: void setup_matrices(const El::Grid& grid) override; - - void setup_data() override { - learning_layer::setup_data(); - - // Initialize default weights if none are provided - if (this->m_weights.size() > 2) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "attempted to setup " << m_name << " with an invalid number of weights"; - throw lbann_exception(err.str()); - } - if (m_bias_scaling_factor != DataType(0)) { - this->m_weights.resize(2, nullptr); - } else { - this->m_weights.resize(1, nullptr); - } - if (this->m_weights[0] == nullptr) { - auto* w = new weights(get_comm()); - std::unique_ptr init(new he_initializer(probability_distribution::gaussian)); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_linearity_weights"); - w->set_initializer(init); - w->set_optimizer(opt); - this->m_weights[0] = w; - this->m_model->add_weights(w); - } - auto& linearity_weights = *this->m_weights[0]; - - // Initialize variance scaling initialization - auto* cast_initializer - = dynamic_cast(linearity_weights.get_initializer()); - if (cast_initializer != nullptr) { - cast_initializer->set_fan_in(get_input_size()); - cast_initializer->set_fan_out(get_output_size()); - } - - // Setup linearity weights - auto linearity_dist = get_prev_activations().DistData(); - if (linearity_dist.colDist != El::MC - || linearity_dist.rowDist != El::MR) { - linearity_dist.colDist = El::STAR; - linearity_dist.rowDist = El::STAR; - } - if (m_transpose) { - linearity_weights.set_dims(get_input_dims(), get_output_dims()); - } else { - linearity_weights.set_dims(get_output_dims(), get_input_dims()); - } - linearity_weights.set_matrix_distribution(linearity_dist); - - // Set up bias if needed. - if (m_bias_scaling_factor != DataType(0)) { - if (this->m_weights[1] == nullptr) { - auto* w = new weights(get_comm()); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_bias_weights"); - w->set_optimizer(opt); - this->m_weights[1] = w; - this->m_model->add_weights(w); - } - auto& bias_weights = *this->m_weights[1]; - // Setup bias weights - auto bias_dist = get_activations().DistData(); - bias_dist.rowDist = El::STAR; - bias_weights.set_dims(get_output_dims()); - bias_weights.set_matrix_distribution(bias_dist); - if (this->m_bias_gradient != nullptr) { - El::Zeros(*this->m_bias_gradient, - bias_weights.get_matrix_height(), - bias_weights.get_matrix_width()); - } - } - - // Initialize freeze state - for (auto&& w : this->m_weights) { - if (m_frozen) { - w->freeze(); - } else { - w->unfreeze(); - } - } - for (auto&& w : this->m_weights) { - if (w->is_frozen() != m_frozen) { - std::stringstream err; - err << (m_frozen ? "" : "un") << "frozen " - << "layer \"" << get_name() << "\" has " - << (w->is_frozen() ? "" : "un") << "frozen " - << "weights \"" << w->get_name() << "\""; - LBANN_ERROR(err.str()); - } - } - - } + void setup_data(size_t max_mini_batch_size) override; void fp_compute() override; void bp_compute() override; @@ -212,13 +102,13 @@ class fully_connected_layer : public learning_layer { /** Scaling factor for bias term. * If the scaling factor is zero, bias is not applied. */ - DataType m_bias_scaling_factor; + TensorDataType m_bias_scaling_factor; /** Bias weights gradient. * This is this layer's contribution to the objective function * gradient w.r.t. the bias weights. 
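For context on m_bias_gradient (standard calculus for an affine layer; the patch itself does not spell this out): with y = W vec(x) + b, each mini-batch sample contributes its output error signal directly to the bias gradient,

    \frac{\partial \mathcal{L}}{\partial b} = \sum_{s=1}^{m} \frac{\partial \mathcal{L}}{\partial y^{(s)}}

where m is the mini-batch size. In other words, the bias gradient is a row-wise sum of the previous error signals (up to the usual mini-batch scaling), and this buffer is where that sum is staged before being handed to the bias weights' optimizer.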
*/ - AbsDistMat* m_bias_gradient; + AbsDistMatrixType* m_bias_gradient; /** Whether the transpose of the linearity matrix is applied. */ bool m_transpose; @@ -228,8 +118,26 @@ class fully_connected_layer : public learning_layer { if (m_bias_gradient != nullptr) delete m_bias_gradient; } + template + friend void fp_compute_impl(fully_connected_layer& l); + template + friend void bp_compute_impl(fully_connected_layer& l); }; +// Builder function +LBANN_DEFINE_LAYER_BUILDER(fully_connected); + +#ifndef LBANN_FULLY_CONNECTED_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class fully_connected_layer; \ + extern template class fully_connected_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_FULLY_CONNECTED_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LEARNING_FULLY_CONNECTED_HPP_INCLUDED diff --git a/include/lbann/layers/learning/learning.hpp b/include/lbann/layers/learning/learning.hpp index 2f2ab120bc5..f3b0d5e451c 100644 --- a/include/lbann/layers/learning/learning.hpp +++ b/include/lbann/layers/learning/learning.hpp @@ -27,16 +27,18 @@ #ifndef LBANN_LAYER_LEARNING_HPP_INCLUDED #define LBANN_LAYER_LEARNING_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { /** @todo Remove. Layers should inherit directly from the base layer * class. */ -class learning_layer : public Layer { + +template +class learning_layer : public data_type_layer { public: - learning_layer(lbann_comm *comm) : Layer(comm) {} + learning_layer(lbann_comm *comm) : data_type_layer(comm) {} }; } // namespace lbann diff --git a/include/lbann/layers/loss/categorical_accuracy.hpp b/include/lbann/layers/loss/categorical_accuracy.hpp index 078abb6b2a4..aa5a5e0d006 100644 --- a/include/lbann/layers/loss/categorical_accuracy.hpp +++ b/include/lbann/layers/loss/categorical_accuracy.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED #define LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -42,11 +42,11 @@ namespace lbann { * This is primarily intended for use as a metric since it is not * differentiable. 
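As a quick illustration of what the categorical accuracy layer reports per sample (a stand-alone sketch with plain STL types, not LBANN's API): the prediction's largest entry is compared against the position of the one-hot label, yielding a 0/1 value that a metric can then average over the mini-batch.

    // Illustrative only: top-1 categorical accuracy for a single sample.
    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <vector>

    int main() {
      std::vector<double> prediction   = {0.1, 0.7, 0.2};  // e.g. softmax output
      std::vector<double> ground_truth = {0.0, 1.0, 0.0};  // one-hot label
      const auto pred_idx =
        std::distance(prediction.begin(),
                      std::max_element(prediction.begin(), prediction.end()));
      const auto label_idx =
        std::distance(ground_truth.begin(),
                      std::max_element(ground_truth.begin(), ground_truth.end()));
      const double accuracy = (pred_idx == label_idx) ? 1.0 : 0.0;
      std::cout << accuracy << '\n';  // prints "1"
      return 0;
    }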
*/ -template -class categorical_accuracy_layer : public Layer { +template +class categorical_accuracy_layer : public data_type_layer { public: - categorical_accuracy_layer(lbann_comm *comm) : Layer(comm) { + categorical_accuracy_layer(lbann_comm *comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 2; } @@ -57,18 +57,18 @@ class categorical_accuracy_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -85,6 +85,19 @@ class categorical_accuracy_layer : public Layer { }; +#ifndef LBANN_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class categorical_accuracy_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class categorical_accuracy_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED diff --git a/include/lbann/layers/loss/cross_entropy.hpp b/include/lbann/layers/loss/cross_entropy.hpp index e2ee89e4350..238bda958a4 100644 --- a/include/lbann/layers/loss/cross_entropy.hpp +++ b/include/lbann/layers/loss/cross_entropy.hpp @@ -27,33 +27,59 @@ #ifndef LBANN_LAYERS_LOSS_CROSS_ENTROPY_HPP_INCLUDED #define LBANN_LAYERS_LOSS_CROSS_ENTROPY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class cross_entropy_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + cross_entropy_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~cross_entropy_distconv_adapter() = default; + void setup_distributions(tensor_overlap_constraints &constraints) override; + dc::Shape get_prev_activations_shape(int index) const override; + dc::Shape get_activations_shape(int index) const override; + dc::Shape get_activations_local_shape(int index) const override; + void setup_layer(size_t workspace_capacity) override; + std::unique_ptr m_cross_entropy; +}; +#endif // LBANN_HAS_DISTCONV + /** @brief Cross entropy loss function. 
* * Given a predicted distribution @f$y@f$ and ground truth * distribution @f$\hat{y}@f$, * @f[ CE(y,\hat{y}) = - \sum\limits_{i} \hat{y}_i \log y_i @f] */ -template -class cross_entropy_layer : public Layer { +template +class cross_entropy_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: - cross_entropy_layer(lbann_comm *comm) : Layer(comm) { + cross_entropy_layer(lbann_comm *comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 2; } cross_entropy_layer(const cross_entropy_layer& other) - : Layer(other) { + : data_type_layer(other) { m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); } cross_entropy_layer& operator=(const cross_entropy_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); @@ -65,18 +91,31 @@ class cross_entropy_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); + +#ifdef LBANN_HAS_DISTCONV + // In the current implementation of cross entropy in Distconv, we + // do not use the reshape layer and just assumes both inputs have + // the matching shape. Therefore, the following check on the input + // dimensions would fail. We could address this by either 1) + // implementing the reshape layer, or 2) giving a proper shape to + // the ground-truth data. + // + if (this->distconv_enabled()) { + return; + } +#endif // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? 
", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -89,19 +128,21 @@ class cross_entropy_layer : public Layer { } - void setup_data() override { - Layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); // Initialize workspace - const auto& prediction = get_prev_activations(0); - switch (get_data_layout()) { + const auto& prediction = this->get_prev_activations(0); + switch (this->get_data_layout()) { case data_layout::DATA_PARALLEL: - m_workspace.reset(new StarVCMat(prediction.Grid(), - prediction.Root())); + m_workspace.reset(new StarVCMatDT( + prediction.Grid(), + prediction.Root())); break; case data_layout::MODEL_PARALLEL: - m_workspace.reset(new StarMRMat(prediction.Grid(), - prediction.Root())); + m_workspace.reset(new StarMRMatDT( + prediction.Grid(), + prediction.Root())); break; default: LBANN_ERROR("invalid data layout"); } @@ -115,55 +156,193 @@ class cross_entropy_layer : public Layer { void fp_compute() override { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + fp_compute_distconv(); + return; + } +#endif + // Initialize workspace - const auto& prediction = get_prev_activations(0); + const auto& prediction = this->get_prev_activations(0); m_workspace->AlignWith(prediction.DistData()); m_workspace->Resize(1, prediction.Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); } void bp_compute() override { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + bp_compute_distconv(); + return; + } +#endif // LBANN_HAS_DISTCONV + // Initialize workspace - const auto& prediction = get_prev_activations(0); + const auto& prediction = this->get_prev_activations(0); m_workspace->AlignWith(prediction.DistData()); - El::Copy(get_prev_error_signals(), *m_workspace); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->LockedMatrix(), - get_local_error_signals(0), - get_local_error_signals(1)); - + local_bp_compute(); } private: /** Compute local contributions to cross entropy loss. */ - static void local_fp_compute(const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_prediction, - AbsMat& local_gradient_wrt_ground_truth); + void local_bp_compute(); /** Workspace matrix. 
*/ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; + +#ifdef LBANN_HAS_DISTCONV + friend class cross_entropy_distconv_adapter; + protected: + bool is_distconv_supported() const override { + return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; + } + + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique< + cross_entropy_distconv_adapter>(*this); + } + + cross_entropy_distconv_adapter& get_distconv_adapter() override; + const cross_entropy_distconv_adapter& get_distconv_adapter() const override; + + void fp_compute_distconv() { + assert_always(this->distconv_enabled()); + get_distconv_adapter().m_cross_entropy->forward(this->get_distconv_adapter().get_prev_activations(0), + this->get_distconv_adapter().get_prev_activations(1), + this->get_distconv_adapter().get_activations()); + } + void bp_compute_distconv() { + assert_always(this->distconv_enabled()); + get_distconv_adapter().m_cross_entropy->backward(this->get_distconv_adapter().get_prev_activations(0), + this->get_distconv_adapter().get_prev_activations(1), + this->get_distconv_adapter().get_prev_error_signals(0), + this->get_distconv_adapter().get_error_signals(0), + this->get_distconv_adapter().get_error_signals(1)); + } +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +const cross_entropy_distconv_adapter& +cross_entropy_layer::get_distconv_adapter() const { + return dynamic_cast&>(data_type_layer::get_distconv_adapter()); +} + +template +cross_entropy_distconv_adapter& +cross_entropy_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +dc::Shape cross_entropy_distconv_adapter:: +get_prev_activations_shape(int index) const { + // Assumes both of the two input tensors have the equal shape. + return data_type_distconv_adapter::get_prev_activations_shape(0); +} + +template +dc::Shape cross_entropy_distconv_adapter:: +get_activations_shape(int output_index) const { + // NOTE: LBANN matrix is a 2-D matrix, while Distconv keeps the + // original spatial and channel dimensions, so + // get_output_tensor_shape() doesn't work here. 
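A concrete example of the shape bookkeeping described in the note above (the numbers are illustrative, not from the patch): if the prediction tensor arrives with the Distconv shape {W, H, C, N} = {16, 16, 8, 128}, the loop below collapses every dimension except the sample dimension and reports an activations shape of {1, 1, 1, 128}, i.e. one scalar loss contribution per sample; get_activations_local_shape applies the same collapse to the locally held portion of the tensor.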
+ dc::Shape shape = this->get_prev_activations_shape(0); + for (int i = 0; i < shape.num_dims() - 1; ++i) { + shape[i] = 1; + } + return shape; +} + +template +dc::Shape cross_entropy_distconv_adapter:: +get_activations_local_shape(int index) const { + assert_eq(index, 0); + auto input_shape = this->get_prev_activations().get_local_shape(); + for (int i = 0; i < input_shape.length() - 1; ++i) { + input_shape[i] = 1; + } + return input_shape; +} + +template +void cross_entropy_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + // Output tensors share all dimensions except for the sample dimension + auto activations_split = this->get_activations_dist().get_split_shape(); + auto prev_error_signals_split = this->get_prev_error_signals_dist().get_split_shape(); + for (int i = 0; i < activations_split.length() - 1; ++i) { + activations_split[i] = 1; + prev_error_signals_split[i] = 1; + } + this->get_activations_dist().set_split_shape(activations_split); + this->get_prev_error_signals_dist().set_split_shape(prev_error_signals_split); + + for (auto &d: this->m_prev_activations_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_activations_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_prev_error_signals_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } + for (auto &d: this->m_error_signals_dists) { + d.clear_overlap(); + constraints.mark_updated(d); + constraints.mark_invariant(d); + } +} + +template +void cross_entropy_distconv_adapter::setup_layer( + size_t workspace_capacity) { + m_cross_entropy = make_unique(dc::get_backend()); + m_cross_entropy->setup(this->get_prev_activations(0), + this->get_prev_activations(1), + this->get_activations(0)); +} +#endif // LBANN_HAS_DISTCONV + +#ifndef LBANN_CROSS_ENTROPY_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class cross_entropy_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class cross_entropy_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CROSS_ENTROPY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_CROSS_ENTROPY_HPP_INCLUDED diff --git a/include/lbann/layers/loss/entrywise.hpp b/include/lbann/layers/loss/entrywise.hpp index 6e55f58313e..6bb9ab1b15f 100644 --- a/include/lbann/layers/loss/entrywise.hpp +++ b/include/lbann/layers/loss/entrywise.hpp @@ -31,25 +31,46 @@ namespace lbann { +#ifndef LBANN_ENTRYWISE_LAYER_INSTANTIATE +#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME +#else +#define BINARY_ETI_DECL_MACRO_DEV(...) 
+#endif // LBANN_BINARY_LAYER_INSTANTIATE + +#ifdef LBANN_HAS_GPU +#define BINARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU); \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::GPU) +#else +#define BINARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU) +#endif // LBANN_HAS_GPU + // Convenience macro to define an entry-wise binary layer class #define DEFINE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string) \ - struct layer_name##_name_struct { \ - inline operator std::string() { return layer_string; } \ - }; \ - template \ - using layer_name \ - = entrywise_binary_layer; + LBANN_DECLARE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string); \ + BINARY_ETI_DECL_MACRO(layer_name, float); \ + BINARY_ETI_DECL_MACRO(layer_name, double) // Cross entropy loss -DEFINE_ENTRYWISE_BINARY_LAYER(binary_cross_entropy_layer, "binary cross entropy"); -DEFINE_ENTRYWISE_BINARY_LAYER(sigmoid_binary_cross_entropy_layer, "sigmoid binary cross entropy"); +DEFINE_ENTRYWISE_BINARY_LAYER(binary_cross_entropy_layer, + "binary cross entropy"); +DEFINE_ENTRYWISE_BINARY_LAYER(sigmoid_binary_cross_entropy_layer, + "sigmoid binary cross entropy"); // Boolean loss functions DEFINE_ENTRYWISE_BINARY_LAYER(boolean_accuracy_layer, "Boolean accuracy"); -DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_negative_layer, "Boolean false negative rate"); -DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_positive_layer, "Boolean false positive rate"); +DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_negative_layer, + "Boolean false negative rate"); +DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_positive_layer, + "Boolean false positive rate"); } // namespace lbann #undef DEFINE_ENTRYWISE_BINARY_LAYER +#undef BINARY_ETI_DECL_MACRO +#undef BINARY_ETI_DECL_MACRO_DEV + #endif // LBANN_LAYERS_LOSS_ENTRYWISE_HPP_INCLUDED diff --git a/include/lbann/layers/loss/l1_norm.hpp b/include/lbann/layers/loss/l1_norm.hpp index 8ceb88c09c3..3315b4ec756 100644 --- a/include/lbann/layers/loss/l1_norm.hpp +++ b/include/lbann/layers/loss/l1_norm.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_L1_NORM_HPP_INCLUDED #define LBANN_LAYERS_LOSS_L1_NORM_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -35,18 +35,27 @@ namespace lbann { * * @f[ \lVert x\rVert_1 = \sum\limits_{i} | x_i | @f] */ -template -class l1_norm_layer : public Layer { +template +class l1_norm_layer : public data_type_layer { public: + /** @name Public Types */ + ///@{ - l1_norm_layer(lbann_comm *comm) : Layer(comm) {} + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + +public: + + l1_norm_layer(lbann_comm *comm) : data_type_layer(comm) {} l1_norm_layer(const l1_norm_layer& other) - : Layer(other), + : data_type_layer(other), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) {} l1_norm_layer& operator=(const l1_norm_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? 
other.m_workspace->Copy() : nullptr); return *this; @@ -57,18 +66,18 @@ class l1_norm_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); } - void setup_data() override { - Layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); // Initialize workspace - auto dist = get_prev_activations().DistData(); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; - m_workspace.reset(AbsDistMat::Instantiate(dist)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist)); #ifdef HYDROGEN_HAVE_CUB if (m_workspace->GetLocalDevice() == El::Device::GPU) { m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool @@ -81,15 +90,14 @@ class l1_norm_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - m_workspace->Resize(1, get_prev_activations().Width()); + m_workspace->AlignWith(this->get_prev_activations()); + m_workspace->Resize(1, this->get_prev_activations().Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_local_prev_activations(), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); // Clean up m_workspace->Empty(); @@ -100,13 +108,11 @@ class l1_norm_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - El::Copy(get_prev_error_signals(), *m_workspace); + m_workspace->AlignWith(this->get_prev_activations()); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_local_prev_activations(), - m_workspace->LockedMatrix(), - get_local_error_signals()); + local_bp_compute(); // Clean up m_workspace->Empty(); @@ -116,18 +122,28 @@ class l1_norm_layer : public Layer { private: /** Compute local contributions to L2 norm. */ - static void local_fp_compute(const AbsMat& local_input, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(const AbsMat& local_input, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_input); + void local_bp_compute(); /** Workspace matrix. 
*/ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_L1_NORM_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class l1_norm_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class l1_norm_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_L1_NORM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_L1_NORM_HPP_INCLUDED diff --git a/include/lbann/layers/loss/l2_norm2.hpp b/include/lbann/layers/loss/l2_norm2.hpp index 15ad24adbd0..0c2d897ba10 100644 --- a/include/lbann/layers/loss/l2_norm2.hpp +++ b/include/lbann/layers/loss/l2_norm2.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED #define LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -35,18 +35,27 @@ namespace lbann { * * @f[ \lVert x\rVert_2^2 = \sum\limits_{i} x_i^2 @f] */ -template -class l2_norm2_layer : public Layer { +template +class l2_norm2_layer : public data_type_layer { public: + /** @name Public Types */ + ///@{ - l2_norm2_layer(lbann_comm *comm) : Layer(comm) {} + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + +public: + + l2_norm2_layer(lbann_comm *comm) : data_type_layer(comm) {} l2_norm2_layer(const l2_norm2_layer& other) - : Layer(other), + : data_type_layer(other), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) {} l2_norm2_layer& operator=(const l2_norm2_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); return *this; @@ -57,18 +66,18 @@ class l2_norm2_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); } - void setup_data() override { - Layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); // Initialize workspace - auto dist = get_prev_activations().DistData(); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; - m_workspace.reset(AbsDistMat::Instantiate(dist)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist)); #ifdef HYDROGEN_HAVE_CUB if (m_workspace->GetLocalDevice() == El::Device::GPU) { m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool @@ -81,15 +90,14 @@ class l2_norm2_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - m_workspace->Resize(1, get_prev_activations().Width()); + m_workspace->AlignWith(this->get_prev_activations()); + m_workspace->Resize(1, this->get_prev_activations().Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_local_prev_activations(), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); // Clean up 
m_workspace->Empty(); @@ -100,13 +108,11 @@ class l2_norm2_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - El::Copy(get_prev_error_signals(), *m_workspace); + m_workspace->AlignWith(this->get_prev_activations()); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_local_prev_activations(), - m_workspace->LockedMatrix(), - get_local_error_signals()); + local_bp_compute(); // Clean up m_workspace->Empty(); @@ -116,18 +122,28 @@ class l2_norm2_layer : public Layer { private: /** Compute local contributions to L2 norm. */ - static void local_fp_compute(const AbsMat& local_input, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(const AbsMat& local_input, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_input); + void local_bp_compute(); /** Workspace matrix. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_L2_NORM2_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class l2_norm2_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class l2_norm2_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_L2_NORM2_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED diff --git a/include/lbann/layers/loss/mean_absolute_error.hpp b/include/lbann/layers/loss/mean_absolute_error.hpp index c136f1f6c72..4a9050e8733 100644 --- a/include/lbann/layers/loss/mean_absolute_error.hpp +++ b/include/lbann/layers/loss/mean_absolute_error.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_MEAN_ABSOLUTE_ERROR_HPP_INCLUDED #define LBANN_LAYERS_LOSS_MEAN_ABSOLUTE_ERROR_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -39,23 +39,32 @@ namespace lbann { * = \frac{1}{n} \sum\limits_{i=1}^{n} | y_i - \hat{y}_i | * @f] */ -template -class mean_absolute_error_layer : public Layer { +template +class mean_absolute_error_layer : public data_type_layer { public: + /** @name Public Types */ + ///@{ - mean_absolute_error_layer(lbann_comm *comm) : Layer(comm) { + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + +public: + + mean_absolute_error_layer(lbann_comm *comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 2; } mean_absolute_error_layer(const mean_absolute_error_layer& other) - : Layer(other) { + : data_type_layer(other) { m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); } mean_absolute_error_layer& operator=(const mean_absolute_error_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? 
other.m_workspace->Copy() : nullptr); @@ -67,18 +76,18 @@ class mean_absolute_error_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -91,12 +100,12 @@ class mean_absolute_error_layer : public Layer { } - void setup_data() override { - Layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); // Initialize workspace - const auto& input_dist = get_prev_activations(0).DistData(); - m_workspace.reset(AbsDistMat::Instantiate(*input_dist.grid, + const auto& input_dist = this->get_prev_activations(0).DistData(); + m_workspace.reset(AbsDistMatrixType::Instantiate(*input_dist.grid, input_dist.root, El::STAR, input_dist.rowDist, @@ -116,17 +125,14 @@ class mean_absolute_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - m_workspace->Resize(1, get_prev_activations().Width()); + m_workspace->AlignWith(this->get_prev_activations()); + m_workspace->Resize(1, this->get_prev_activations().Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); // Clean up m_workspace->Empty(); @@ -137,16 +143,11 @@ class mean_absolute_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - El::Copy(get_prev_error_signals(), *m_workspace); + m_workspace->AlignWith(this->get_prev_activations()); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->LockedMatrix(), - get_local_error_signals(0), - get_local_error_signals(1)); + local_bp_compute(); // Clean up m_workspace->Empty(); @@ -156,23 +157,28 @@ class mean_absolute_error_layer : public Layer { private: /** Compute local contributions to mean absolute error loss. 
*/ - static void local_fp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_prediction, - AbsMat& local_gradient_wrt_ground_truth); + void local_bp_compute(); /** Workspace matrix. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_MEAN_ABSOLUTE_ERROR_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class mean_absolute_error_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class mean_absolute_error_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_MEAN_ABSOLUTE_ERROR_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_MEAN_ABSOLUTE_ERROR_HPP_INCLUDED diff --git a/include/lbann/layers/loss/mean_squared_error.hpp b/include/lbann/layers/loss/mean_squared_error.hpp index 19ead85c346..d3de49a9580 100644 --- a/include/lbann/layers/loss/mean_squared_error.hpp +++ b/include/lbann/layers/loss/mean_squared_error.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_MEAN_SQUARED_ERROR_HPP_INCLUDED #define LBANN_LAYERS_LOSS_MEAN_SQUARED_ERROR_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -39,23 +39,32 @@ namespace lbann { * = \frac{1}{n} \sum\limits_{i=1}^{n} (y_i - \hat{y}_i)^2 * @f] */ -template -class mean_squared_error_layer : public Layer { +template +class mean_squared_error_layer : public data_type_layer { public: + /** @name Public Types */ + ///@{ - mean_squared_error_layer(lbann_comm *comm) : Layer(comm) { + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + +public: + + mean_squared_error_layer(lbann_comm *comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 2; } mean_squared_error_layer(const mean_squared_error_layer& other) - : Layer(other) { + : data_type_layer(other) { m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); } mean_squared_error_layer& operator=(const mean_squared_error_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? 
other.m_workspace->Copy() : nullptr); @@ -67,18 +76,18 @@ class mean_squared_error_layer : public Layer { data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -91,12 +100,12 @@ class mean_squared_error_layer : public Layer { } - void setup_data() override { - Layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); // Initialize workspace - const auto& input_dist = get_prev_activations(0).DistData(); - m_workspace.reset(AbsDistMat::Instantiate(*input_dist.grid, + const auto& input_dist = this->get_prev_activations(0).DistData(); + m_workspace.reset(AbsDistMatrixType::Instantiate(*input_dist.grid, input_dist.root, El::STAR, input_dist.rowDist, @@ -116,17 +125,14 @@ class mean_squared_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - m_workspace->Resize(1, get_prev_activations().Width()); + m_workspace->AlignWith(this->get_prev_activations()); + m_workspace->Resize(1, this->get_prev_activations().Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); // Clean up m_workspace->Empty(); @@ -137,16 +143,11 @@ class mean_squared_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - El::Copy(get_prev_error_signals(), *m_workspace); + m_workspace->AlignWith(this->get_prev_activations()); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->LockedMatrix(), - get_local_error_signals(0), - get_local_error_signals(1)); + local_bp_compute(); // Clean up m_workspace->Empty(); @@ -156,23 +157,28 @@ class mean_squared_error_layer : public Layer { private: /** Compute local contributions to mean squared error loss. 
*/ - static void local_fp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_prediction, - AbsMat& local_gradient_wrt_ground_truth); + void local_bp_compute(); /** Workspace matrix. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_MEAN_SQUARED_ERROR_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class mean_squared_error_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class mean_squared_error_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_MEAN_SQUARED_ERROR_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_MEAN_SQUARED_ERROR_HPP_INCLUDED diff --git a/include/lbann/layers/loss/top_k_categorical_accuracy.hpp b/include/lbann/layers/loss/top_k_categorical_accuracy.hpp index 6e0389e5f73..a58362bc4ef 100644 --- a/include/lbann/layers/loss/top_k_categorical_accuracy.hpp +++ b/include/lbann/layers/loss/top_k_categorical_accuracy.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_TOP_K_CATEGORICAL_ACCURACY_HPP_INCLUDED #define LBANN_LAYERS_LOSS_TOP_K_CATEGORICAL_ACCURACY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -42,12 +42,12 @@ namespace lbann { * * @todo Gracefully handle case where label is not a one-hot vector. */ -template -class top_k_categorical_accuracy_layer : public Layer { +template +class top_k_categorical_accuracy_layer : public data_type_layer { public: top_k_categorical_accuracy_layer(lbann_comm *comm, El::Int k) - : Layer(comm), m_k(k) { + : data_type_layer(comm), m_k(k) { this->m_expected_num_parent_layers = 2; } @@ -59,25 +59,25 @@ class top_k_categorical_accuracy_layer : public Layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("k", m_k); return desc; } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? 
", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -99,6 +99,18 @@ class top_k_categorical_accuracy_layer : public Layer { }; +#ifndef LBANN_TOP_K_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class top_k_categorical_accuracy_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class top_k_categorical_accuracy_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_TOP_K_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_TOP_K_CATEGORICAL_ACCURACY_HPP_INCLUDED diff --git a/include/lbann/layers/math/CMakeLists.txt b/include/lbann/layers/math/CMakeLists.txt index a6d19112716..ca9b3d9461b 100644 --- a/include/lbann/layers/math/CMakeLists.txt +++ b/include/lbann/layers/math/CMakeLists.txt @@ -3,6 +3,7 @@ set_full_path(THIS_DIR_HEADERS unary.hpp binary.hpp clamp.hpp + matmul.hpp ) # Propagate the files up the tree diff --git a/include/lbann/layers/math/binary.hpp b/include/lbann/layers/math/binary.hpp index d389ccbaae8..cb462a21dff 100644 --- a/include/lbann/layers/math/binary.hpp +++ b/include/lbann/layers/math/binary.hpp @@ -27,68 +27,83 @@ #ifndef LBANN_LAYERS_MATH_BINARY_HPP_INCLUDED #define LBANN_LAYERS_MATH_BINARY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { -/** @brief Templated class for entry-wise binary layers. - * @param Layout Parallelism scheme. - * @param Device Device allocation. - * @param Name Type that can be converted into a string. - */ -template -class entrywise_binary_layer : public Layer { -public: - - entrywise_binary_layer(lbann_comm *comm) : Layer(comm) { - this->m_expected_num_parent_layers = 2; - } - entrywise_binary_layer* copy() const override { - return new entrywise_binary_layer(*this); - } - std::string get_type() const override { return Name(); } - data_layout get_data_layout() const override { return Layout; } - El::Device get_device_allocation() const override { return Device; } - -protected: - - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); - - // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); - std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); - err << (i > 0 ? ", " : "") - << "layer \"" << parents[i]->get_name() << "\" outputs "; - for (size_t j = 0; j < dims.size(); ++j) { - err << (j > 0 ? 
" x " : "") << dims[j]; - } - } - err << ")"; - LBANN_ERROR(err.str()); - } - +#define LBANN_DECLARE_ENTRYWISE_BINARY_LAYER(LAYER_NAME, LAYER_STRING) \ + template \ + class LAYER_NAME : public data_type_layer { \ + public: \ + LAYER_NAME(lbann_comm *comm) : data_type_layer(comm) { \ + this->m_expected_num_parent_layers = 2; \ + } \ + LAYER_NAME* copy() const override { \ + return new LAYER_NAME(*this); \ + } \ + std::string get_type() const override { return LAYER_STRING; } \ + data_layout get_data_layout() const override { return Layout; } \ + El::Device get_device_allocation() const override { return Device; } \ + protected: \ + void setup_dims(DataReaderMetaData& dr_metadata) override { \ + data_type_layer::setup_dims(dr_metadata); \ + this->set_output_dims(this->get_input_dims()); \ + /* Check that input dimensions match */ \ + if (this->get_input_dims(0) != this->get_input_dims(1)) { \ + const auto& parents = this->get_parent_layers(); \ + std::stringstream err; \ + err << this->get_type() << " layer \"" << this->get_name() << "\" " \ + << "has input tensors with different dimensions ("; \ + for (int i = 0; i < this->get_num_parents(); ++i) { \ + const auto& dims = this->get_input_dims(i); \ + err << (i > 0 ? ", " : "") \ + << "layer \"" << parents[i]->get_name() << "\" outputs "; \ + for (size_t j = 0; j < dims.size(); ++j) { \ + err << (j > 0 ? " x " : "") << dims[j]; \ + } \ + } \ + err << ")"; \ + LBANN_ERROR(err.str()); \ + } \ + } \ + void fp_compute() override; \ + void bp_compute() override; \ } - void fp_compute() override; - void bp_compute() override; - -}; +// Convenience macros for ETI decls for binary layers + +#ifndef LBANN_BINARY_LAYER_INSTANTIATE +#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME +#else +#define BINARY_ETI_DECL_MACRO_DEV(...) 
+#endif // LBANN_BINARY_LAYER_INSTANTIATE + +// Instnatiate both data and model parallel layers +#define BINARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, T, DEVICE) \ + template class LAYER_NAME; \ + template class LAYER_NAME + +// Instantiate a DEVICE for each allowed tensor data type +#define BINARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ + BINARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, float, DEVICE); \ + BINARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, double, DEVICE) + +#ifdef LBANN_HAS_GPU +#define BINARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU); \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::GPU) +#else +#define BINARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU) +#endif // LBANN_HAS_GPU // Convenience macro to define an entry-wise binary layer class #define DEFINE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string) \ - struct layer_name##_name_struct { \ - inline operator std::string() { return layer_string; } \ - }; \ - template \ - using layer_name \ - = entrywise_binary_layer; + LBANN_DECLARE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string); \ + BINARY_ETI_DECL_MACRO(layer_name, float); \ + BINARY_ETI_DECL_MACRO(layer_name, double) // Arithmetic operations DEFINE_ENTRYWISE_BINARY_LAYER(add_layer, "add"); @@ -118,4 +133,7 @@ DEFINE_ENTRYWISE_BINARY_LAYER(logical_xor_layer, "logical xor"); } // namespace lbann #undef DEFINE_ENTRYWISE_BINARY_LAYER +#undef BINARY_ETI_DECL_MACRO +#undef BINARY_ETI_DECL_MACRO_DEV + #endif // LBANN_LAYERS_MATH_BINARY_HPP_INCLUDED diff --git a/include/lbann/layers/math/clamp.hpp b/include/lbann/layers/math/clamp.hpp index 4b79dc06c09..69164b65da3 100644 --- a/include/lbann/layers/math/clamp.hpp +++ b/include/lbann/layers/math/clamp.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MATH_CLAMP_HPP_INCLUDED #define LBANN_LAYERS_MATH_CLAMP_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -42,12 +42,18 @@ namespace lbann { * \end{cases} * @f] */ -template -class clamp_layer : public Layer { +template +class clamp_layer : public data_type_layer { +#ifdef LBANN_HAS_GPU_FP16 + using CompareType = typename std::conditional::value, float, TensorDataType>::type; +#else + using CompareType = TensorDataType; +#endif + public: - clamp_layer(lbann_comm *comm, DataType min, DataType max) - : Layer(comm), m_min(min), m_max(max) { - if (m_min > m_max) { + clamp_layer(lbann_comm *comm, TensorDataType min, TensorDataType max) + : data_type_layer(comm), m_min(min), m_max(max) { + if (CompareType(m_min) > CompareType(m_max)) { std::stringstream err; err << "[" << m_min << "," << m_max << "] is an invalid range"; LBANN_ERROR(err.str()); @@ -59,7 +65,7 @@ class clamp_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); std::stringstream ss; ss << "[" << m_min << "," << m_max << "]"; desc.add("Range", ss.str()); @@ -67,21 +73,34 @@ class clamp_layer : public Layer { } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; void bp_compute() override; private: /** Minimum output. 
*/ - DataType m_min; + TensorDataType m_min; /** Maximum output. */ - DataType m_max; + TensorDataType m_max; }; +#ifndef LBANN_CLAMP_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class clamp_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class clamp_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CLAMP_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MATH_CLAMP_HPP_INCLUDED diff --git a/include/lbann/layers/math/matmul.hpp b/include/lbann/layers/math/matmul.hpp new file mode 100644 index 00000000000..1331bee921c --- /dev/null +++ b/include/lbann/layers/math/matmul.hpp @@ -0,0 +1,204 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_MATH_MATMUL_HPP_INCLUDED +#define LBANN_LAYER_MATH_MATMUL_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Matrix multiplication. + * + * Takes two 2D input tensors and outputs their matrix product. + * Matrix products are computed independently for each mini-batch + * sample, in a similar manner as NumPy's matmul function. + * + * @todo Support >2 dimensions, matvecs, and dot products + * + */ +template +class matmul_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "matmul_layer only supports " + "data-parallel data layout"); + +public: + + matmul_layer(lbann_comm *comm, + bool transpose_a = false, + bool transpose_b = false); + matmul_layer(const matmul_layer& other) = default; + matmul_layer& operator=(const matmul_layer& other) = default; + matmul_layer* copy() const override; + + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + void fp_compute() override; + void bp_compute() override; + +private: + + /** If true, matrices from the first input tensor are transposed + * before multiplication. */ + bool m_transpose_a; + /** If true, matrices from the second input tensor are transposed + * before multiplication. 
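To make the dimension rules that setup_dims enforces below concrete, here is a stand-alone sketch with made-up sizes (plain C++, not LBANN's API): for per-sample matrices A of size 3x4 and B of size 4x2 with no transposes, the contracted dimensions must agree and the output is 3x2.

    // Illustrative only: the shape rule checked by setup_dims(), here for
    // the NN (no transpose) case with hard-coded sizes.
    #include <array>
    #include <iostream>

    int main() {
      // Input 0 is 3x4, input 1 is 4x2 (height x width, per mini-batch sample).
      std::array<int, 2> dims0 = {3, 4}, dims1 = {4, 2};
      bool transpose_a = false, transpose_b = false;

      const int inner0 = transpose_a ? dims0[0] : dims0[1]; // contracted dim of A
      const int inner1 = transpose_b ? dims1[1] : dims1[0]; // contracted dim of B
      if (inner0 != inner1) {
        std::cerr << "incompatible matrix dimensions\n";
        return 1;
      }
      const int out_height = transpose_a ? dims0[1] : dims0[0];
      const int out_width  = transpose_b ? dims1[0] : dims1[1];
      std::cout << out_height << " x " << out_width << '\n'; // prints "3 x 2"
      return 0;
    }

Setting transpose_a or transpose_b flips which extent of that input participates in the compatibility check, exactly as in the NN/NT/TN/TT cases named in the error message below.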
*/ + bool m_transpose_b; + + template + friend void fp_compute_impl(matmul_layer&, bool, bool); + template + friend void bp_compute_impl(matmul_layer&, bool, bool); +}; + +// ========================================================= +// Implementation +// ========================================================= + +template +matmul_layer::matmul_layer(lbann_comm *comm, bool transpose_a, bool transpose_b) + : data_type_layer(comm), + m_transpose_a{transpose_a}, + m_transpose_b{transpose_b} { + this->m_expected_num_parent_layers = 2; +} + +template +matmul_layer* matmul_layer::copy() const { + return new matmul_layer(*this); +} + +template +std::string matmul_layer::get_type() const { + return "matrix multiply"; +} + +template +data_layout matmul_layer::get_data_layout() const { + return Layout; +} + +template +El::Device matmul_layer::get_device_allocation() const { + return Device; +} + +template +description matmul_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Transpose A", m_transpose_a); + desc.add("Transpose B", m_transpose_b); + return desc; +} + +template +void matmul_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + + // Input dimensions + const auto& input0_dims = this->get_input_dims(0); + const auto& input1_dims = this->get_input_dims(1); + + // Lambdas to help print error messages + auto print_name = [this] () -> std::string { + return this->get_type() + " layer \"" + this->get_name() + "\""; + }; + auto print_inputs = [this, &input0_dims, &input1_dims] () -> std::string { + auto print_dims = [] (const decltype(input0_dims)& dims) -> std::string { + std::ostringstream ss; + for (size_t i = 0; i < dims.size(); ++i) { + ss << (i > 0 ? "x" : "") << dims[i]; + } + return ss.str(); + }; + const auto& parents = this->get_parent_layers(); + return lbann::build_string( + parents[0]->get_type()," layer \"",parents[0]->get_name(),"\" ", + "outputs ",print_dims(input0_dims),", ", + parents[1]->get_type()," layer \"",parents[1]->get_name(),"\" ", + "outputs ",print_dims(input1_dims)); + }; + + // Check input dimensions + if (input0_dims.size() != input1_dims.size()) { + LBANN_ERROR("input tensors in ",print_name()," " + "have different numbers of dimensions ", + "(",print_inputs(),")"); + } + if (input0_dims.size() != 2) { + LBANN_ERROR("input tensors in ",print_name()," are not 2D ", + "(",print_inputs(),")"); + } + + // Get matrix dimensions + const auto input0_height = *(input0_dims.rbegin()+1); + const auto input0_width = *(input0_dims.rbegin()); + const auto input1_height = *(input1_dims.rbegin()+1); + const auto input1_width = *(input1_dims.rbegin()); + if ((m_transpose_a ? input0_height : input0_width) + != (m_transpose_b ? input1_width : input1_height)) { + LBANN_ERROR("input tensors in ",print_name()," ", + "are not compatible with ", + (m_transpose_a ? "T" : "N"), (m_transpose_b ? "T" : "N"), + " matrix multiplication ", + "(",print_inputs(),")"); + } + + // Set output dimensions + std::vector output_dims(input0_dims); + *(output_dims.rbegin()+1) = (m_transpose_a ? input0_width : input0_height); + *(output_dims.rbegin()) = (m_transpose_b ? 
input1_height : input1_width); + this->set_output_dims(output_dims); + +} + +// ========================================================= +// Explicit template instantiation +// ========================================================= + +#ifndef LBANN_MATMUL_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class matmul_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_MATMUL_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYER_MATH_MATMUL_HPP_INCLUDED diff --git a/include/lbann/layers/math/unary.hpp b/include/lbann/layers/math/unary.hpp index 73034b0593f..550f7fa7b45 100644 --- a/include/lbann/layers/math/unary.hpp +++ b/include/lbann/layers/math/unary.hpp @@ -27,42 +27,62 @@ #ifndef LBANN_LAYERS_MATH_UNARY_HPP_INCLUDED #define LBANN_LAYERS_MATH_UNARY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { -/** @brief Templated class for entry-wise unary layers. - * @param Layout Parallelism scheme. - * @param Device Device allocation. - * @param Name Type that can be converted into a string. - */ -template -class entrywise_unary_layer : public Layer { -public: - entrywise_unary_layer(lbann_comm *comm) : Layer(comm) {} - entrywise_unary_layer* copy() const override { - return new entrywise_unary_layer(*this); +#define LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(LAYER_NAME, LAYER_STRING) \ + template \ + class LAYER_NAME : public data_type_layer { \ + public: \ + LAYER_NAME(lbann_comm *comm) : data_type_layer(comm) {} \ + LAYER_NAME* copy() const override { \ + return new LAYER_NAME(*this); \ + } \ + std::string get_type() const override { return LAYER_STRING; } \ + data_layout get_data_layout() const override { return Layout; } \ + El::Device get_device_allocation() const override { return Device; } \ + protected: \ + void setup_dims(DataReaderMetaData& dr_metadata) override { \ + data_type_layer::setup_dims(dr_metadata); \ + this->set_output_dims(this->get_input_dims()); \ + } \ + void fp_compute() override; \ + void bp_compute() override; \ } - std::string get_type() const override { return Name(); } - data_layout get_data_layout() const override { return Layout; } - El::Device get_device_allocation() const override { return Device; } -protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); - } - void fp_compute() override; - void bp_compute() override; -}; + +// Convenience macros for ETI decls for unary layers + +#ifndef LBANN_UNARY_LAYER_INSTANTIATE +#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME +#else +#define UNARY_ETI_DECL_MACRO_DEV(...) 
+#endif // LBANN_UNARY_LAYER_INSTANTIATE + +#define UNARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, T, DEVICE) \ + template class LAYER_NAME; \ + template class LAYER_NAME + +#define UNARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ + UNARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, float, DEVICE); \ + UNARY_ETI_INST_MACRO_DEV_DT(LAYER_NAME, double, DEVICE) + +#ifdef LBANN_HAS_GPU +#define UNARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU); \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::GPU) +#else +#define UNARY_ETI_DECL_MACRO(LAYER_NAME, T) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, T, El::Device::CPU) +#endif // LBANN_HAS_GPU // Convenience macro to define an entry-wise unary layer class -#define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ - struct layer_name##_name_struct { \ - inline operator std::string() { return layer_string; } \ - }; \ - template \ - using layer_name \ - = entrywise_unary_layer; +#define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ + LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string); \ + UNARY_ETI_DECL_MACRO(layer_name, float); \ + UNARY_ETI_DECL_MACRO(layer_name, double) // Logical operations DEFINE_ENTRYWISE_UNARY_LAYER(logical_not_layer, "logical not"); @@ -109,4 +129,7 @@ DEFINE_ENTRYWISE_UNARY_LAYER(atanh_layer, "hyperbolic arctangent"); } // namespace lbann #undef DEFINE_ENTRYWISE_UNARY_LAYER +#undef UNARY_ETI_DECL_MACRO +#undef UNARY_ETI_DECL_MACRO_DEV + #endif // LBANN_LAYERS_MATH_UNARY_HPP_INCLUDED diff --git a/include/lbann/layers/misc/CMakeLists.txt b/include/lbann/layers/misc/CMakeLists.txt index 2b5808fdfa7..06c9e2acfb7 100644 --- a/include/lbann/layers/misc/CMakeLists.txt +++ b/include/lbann/layers/misc/CMakeLists.txt @@ -5,6 +5,9 @@ set_full_path(THIS_DIR_HEADERS channelwise_mean.hpp mini_batch_index.hpp mini_batch_size.hpp + argmax.hpp + argmin.hpp + one_hot.hpp ) # Propagate the files up the tree diff --git a/include/lbann/layers/misc/argmax.hpp b/include/lbann/layers/misc/argmax.hpp new file mode 100644 index 00000000000..71c581dd19c --- /dev/null +++ b/include/lbann/layers/misc/argmax.hpp @@ -0,0 +1,86 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED +#define LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Get index of maximum-value tensor entry + * + * Expects a 1-D input tensor. If multiple entries have the same + * maximum value, outputs the index of the first one. + */ +template +class argmax_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "argmax layer only supports data parallel layout"); + static_assert(Device == El::Device::CPU, + "argmax layer only supports CPU"); +public: + + argmax_layer(lbann_comm* comm) : data_type_layer(comm) { } + argmax_layer* copy() const override { return new argmax_layer(*this); } + std::string get_type() const override { return "argmax"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); + + // Make sure input tensor is 1-D + const auto input_dims = this->get_input_dims(); + if (input_dims.size() != 1) { + LBANN_ERROR(get_type()," layer \"",this->get_name(),"\" ", + "expects a 1-D input tensor, ", + "but parent layer \"",this->m_parent_layers[0]->get_name(),"\" ", + "outputs a ",input_dims.size(),"-D tensor"); + } + + } + + void fp_compute() override; + +}; + +#ifndef LBANN_ARGMAX_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class argmax_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#endif // LBANN_ARGMAX_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED diff --git a/include/lbann/layers/misc/argmin.hpp b/include/lbann/layers/misc/argmin.hpp new file mode 100644 index 00000000000..ccfe846bfc1 --- /dev/null +++ b/include/lbann/layers/misc/argmin.hpp @@ -0,0 +1,85 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED +#define LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Get index of minimum-value tensor entry + * + * Expects a 1-D input tensor. 
If multiple entries have the same + * minimum value, outputs the index of the first one. + */ +template +class argmin_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "argmin layer only supports data parallel layout"); + static_assert(Device == El::Device::CPU, + "argmin layer only supports CPU"); +public: + + argmin_layer(lbann_comm* comm) : data_type_layer(comm) { } + argmin_layer* copy() const override { return new argmin_layer(*this); } + std::string get_type() const override { return "argmin"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); + + // Make sure input tensor is 1-D + const auto input_dims = this->get_input_dims(); + if (input_dims.size() != 1) { + LBANN_ERROR(get_type()," layer \"",this->get_name(),"\" ", + "expects a 1-D input tensor, ", + "but parent layer \"",this->m_parent_layers[0]->get_name(),"\" ", + "outputs a ",input_dims.size(),"-D tensor"); + } + + } + + void fp_compute() override; + +}; + +#ifndef LBANN_ARGMIN_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class argmin_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#endif // LBANN_ARGMIN_LAYER_INSTANTIATE +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED diff --git a/include/lbann/layers/misc/channelwise_mean.hpp b/include/lbann/layers/misc/channelwise_mean.hpp index 5889b853256..aea45b04a6c 100644 --- a/include/lbann/layers/misc/channelwise_mean.hpp +++ b/include/lbann/layers/misc/channelwise_mean.hpp @@ -27,20 +27,22 @@ #ifndef LBANN_LAYERS_MISC_CHANNELWISE_MEAN_HPP_INCLUDED #define LBANN_LAYERS_MISC_CHANNELWISE_MEAN_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { /** @todo Replace with more general reduction layer. 
*/ -template -class channelwise_mean_layer : public Layer { +template +class channelwise_mean_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "channelwise_mean_layer only supports " + "data-parallel data layout"); public: channelwise_mean_layer(lbann_comm *comm) - : Layer(comm) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "channelwise_mean_layer only supports " - "data-parallel data layout"); + : data_type_layer(comm) { if (comm->am_trainer_master()) { LBANN_WARNING("channelwise_mean_layer is experimental " "and may be deprecated at any time"); @@ -54,10 +56,10 @@ class channelwise_mean_layer : public Layer { protected: - void setup_dims() override { - Layer::setup_dims(); - const auto& input_dims = get_input_dims(); - set_output_dims({input_dims[0]}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + const auto& input_dims = this->get_input_dims(); + this->set_output_dims({input_dims[0]}); } void fp_compute() override; @@ -65,6 +67,14 @@ class channelwise_mean_layer : public Layer { }; +#ifndef LBANN_CHANNELWISE_MEAN_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class channelwise_mean_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_CHANNELWISE_MEAN_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_CHANNELWISE_MEAN_HPP_INCLUDED diff --git a/include/lbann/layers/misc/channelwise_softmax.hpp b/include/lbann/layers/misc/channelwise_softmax.hpp new file mode 100644 index 00000000000..41ae2e4f865 --- /dev/null +++ b/include/lbann/layers/misc/channelwise_softmax.hpp @@ -0,0 +1,127 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_REGULARIZERS_CHANNELWISE_SOFTMAX_HPP_INCLUDED +#define LBANN_LAYERS_REGULARIZERS_CHANNELWISE_SOFTMAX_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Apply softmax to tensor channels. 
+ * + * The input tensor is sliced along the first tensor dimension (the + * "channel" dimension for image data in CHW format) and the softmax + * function is applied to each slice: + * @f[ \text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}} @f] + * + * This is not to be confused with @c softmax_mode::CHANNEL for + * @c softmax_layer, which applies the softmax function to entries + * corresponding to the same spatial position. "Channel mode" softmax + * might be described as "position-wise softmax". + * + */ +template +class channelwise_softmax_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "channelwise_softmax_layer only supports " + "data-parallel data layout"); + +public: + + channelwise_softmax_layer(lbann_comm* comm); + + channelwise_softmax_layer(const channelwise_softmax_layer& other) = default; + channelwise_softmax_layer& operator=(const channelwise_softmax_layer& other) = default; + channelwise_softmax_layer* copy() const override; + + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + + void fp_compute() override; + void bp_compute() override; + +}; + +// Builder function +LBANN_DEFINE_LAYER_BUILDER(channelwise_softmax); + +// ========================================================= +// Implementation +// ========================================================= + +template +channelwise_softmax_layer::channelwise_softmax_layer( + lbann_comm* comm) + : data_type_layer(comm) +{} + +template +channelwise_softmax_layer* channelwise_softmax_layer::copy() const { + return new channelwise_softmax_layer(*this); +} + +template +std::string channelwise_softmax_layer::get_type() const { + return "channel-wise softmax"; +} + +template +data_layout channelwise_softmax_layer::get_data_layout() const { + return Layout; +} + +template +El::Device channelwise_softmax_layer::get_device_allocation() const { + return Device; +} + +template +void channelwise_softmax_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); +} + +// ========================================================= +// Explicit template instantiation +// ========================================================= + +#ifndef LBANN_CHANNELWISE_SOFTMAX_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class channelwise_softmax_layer< \ + T, data_layout::DATA_PARALLEL, Device>; +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_CHANNELWISE_SOFTMAX_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_REGULARIZERS_CHANNELWISE_SOFTMAX_HPP_INCLUDED diff --git a/include/lbann/layers/misc/covariance.hpp b/include/lbann/layers/misc/covariance.hpp index 8f31d12d545..a370e71f58d 100644 --- a/include/lbann/layers/misc/covariance.hpp +++ b/include/lbann/layers/misc/covariance.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED #define LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -43,22 +43,31 @@ namespace lbann { * Scaling by @f$ 1/n @f$ instead of @f$ 1/(n-1) @f$ is a biased * estimator. 
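For reference, the quantity being described is the ordinary per-sample covariance of the two input tensors' entries. The following is the textbook definition, stated here for clarity rather than copied from the header, with the biased form scaling by 1/n and the unbiased form by 1/(n-1):

    \bar{x} = \frac{1}{n}\sum_{i=1}^{n} x_i, \qquad
    \bar{y} = \frac{1}{n}\sum_{i=1}^{n} y_i

    \operatorname{cov}(x,y) =
      \begin{cases}
        \dfrac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y}) & \text{(biased)} \\[1ex]
        \dfrac{1}{n-1}\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y}) & \text{(unbiased)}
      \end{cases}
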
*/ -template -class covariance_layer : public Layer { +template +class covariance_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: covariance_layer(lbann_comm *comm, bool biased) - : Layer(comm), m_biased(biased) { + : data_type_layer(comm), m_biased(biased) { this->m_expected_num_parent_layers = 2; } covariance_layer(const covariance_layer& other) - : Layer(other), + : data_type_layer(other), m_biased(other.m_biased), m_means(other.m_means ? other.m_means->Copy() : nullptr), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) {} covariance_layer& operator=(const covariance_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_biased = other.m_biased; m_means.reset(other.m_means ? other.m_means->Copy() : nullptr); m_workspace.reset(other.m_workspace ? @@ -72,7 +81,7 @@ class covariance_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("Biased", m_biased); return desc; } @@ -80,23 +89,23 @@ class covariance_layer : public Layer { protected: void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist_data = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist_data = this->get_prev_activations().DistData(); dist_data.colDist = El::STAR; - m_means.reset(AbsDistMat::Instantiate(dist_data)); - m_workspace.reset(AbsDistMat::Instantiate(dist_data)); + m_means.reset(AbsDistMatrixType::Instantiate(dist_data)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist_data)); } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -117,12 +126,21 @@ class covariance_layer : public Layer { bool m_biased; /** Means for each mini-batch sample. */ - std::unique_ptr m_means; + std::unique_ptr m_means; /** Workspace. 
*/ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_COVARIANCE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class covariance_layer; \ + extern template class covariance_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_COVARIANCE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED diff --git a/include/lbann/layers/misc/dist_embedding.hpp b/include/lbann/layers/misc/dist_embedding.hpp new file mode 100644 index 00000000000..9d7cb445521 --- /dev/null +++ b/include/lbann/layers/misc/dist_embedding.hpp @@ -0,0 +1,406 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_DIST_EMBEDDING_HPP_INCLUDED +#define LBANN_LAYERS_MISC_DIST_EMBEDDING_HPP_INCLUDED +#include "lbann/base.hpp" +#include "lbann/layers/layer.hpp" + +#if defined(LBANN_HAS_SHMEM) || defined(LBANN_HAS_NVSHMEM) +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/optimizers/sgd.hpp" +#include "lbann/weights/weights_helpers.hpp" +#include "lbann/utils/memory.hpp" + +namespace lbann { + +/** @brief Embedding layer with distributed weights. + * + * This is similar to the embedding layer, which takes integer + * indices and returns embedding vectors from a lookup table. + * However, the embedding vectors are distributed between processes + * and one-sided inter-process communication is performed with + * OpenSHMEM (on CPU) or NVSHMEM (on GPU). + * + * The main benefit of this model-parallel approach is to handle + * cases where the embedding vectors don't fit on one process. It + * should also have better scaling properties when the mini-batch + * size is very large. + * + * To take advantage of sparse gradients, the distributed embedding + * layer provides the option to bypass the optimizer (which currently + * only supports dense gradients) and perform sparse SGD directly on + * the embedding weights. If enabled, SGD occurs during the layers + * "update" phase (i.e. in the virtual update_compute function). + * Otherwise, the layer converts sparse gradients to a dense tensor + * and passes it into the usual optimizer. This is a hack and will be + * deprecated once the optimizer class supports sparse gradients. + * + * @warning This is experimental. 
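A minimal sketch of the sparse SGD idea described above, assuming that only the embedding rows actually referenced in the current mini-batch carry gradients. The container types and the function name are illustrative placeholders, not LBANN's API, and the SHMEM/data-parallel details are omitted:

    #include <cstddef>
    #include <unordered_map>
    #include <vector>

    using Embeddings   = std::vector<std::vector<float>>;  // num_embeddings x embedding_dim
    using RowGradients = std::unordered_map<std::size_t, std::vector<float>>;  // row -> gradient

    // Update only the rows that were looked up; untouched rows are skipped,
    // which is the benefit of bypassing a dense-gradient optimizer.
    void sparse_sgd_step(Embeddings& embeddings, const RowGradients& grads, float lr) {
      for (const auto& [row, g] : grads) {
        auto& e = embeddings[row];
        for (std::size_t j = 0; j < e.size(); ++j) {
          e[j] -= lr * g[j];
        }
      }
    }
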
+ * + * @todo Sparse SGD with optimizer class + */ +template +class dist_embedding_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "distributed embedding layer only supports data parallel layout"); + +public: + + dist_embedding_layer( + lbann_comm* comm, + size_t num_embeddings, + size_t embedding_dim, + bool sparse_sgd, + DataType learning_rate, + bool barrier_in_forward_prop); + + dist_embedding_layer(const dist_embedding_layer& other); + dist_embedding_layer& operator=(const dist_embedding_layer& other); + ~dist_embedding_layer(); + + dist_embedding_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_data(size_t max_mini_batch_size) override; + + void fp_compute() override; + void bp_compute() override; + bool update_compute() override; + +public: + + /** Metadata for an embedding vector from a remote process. + * + * This should be treated as an internal implementation detail. It + * is only in public scope so it is available to CUDA kernels in an + * anonymous namespace. + */ + struct vector_metadata { + size_t source_rank{0}; + size_t source_index{0}; + size_t target_rank{0}; + size_t target_index{0}; + bool is_active{false}; + }; + +private: + + using LocalMat = El::Matrix; + + /** @brief Non-blocking barrier + * @todo Handle case with non-default CUDA stream. + * @todo Move to comm header. + */ + static void nb_barrier( + lbann_comm& comm, + const El::mpi::Comm& c, + Al::request& req); + + void attach_embeddings_to_shmem_buffer(); + void apply_sparse_sgd_step( + size_t num_gradients, + LocalMat& local_embeddings); + + /** SHMEM buffer for embedding vectors. + * + * If the embedding weights matrix is not already attached to a + * SHMEM buffer, then this layer allocates a SHMEM buffer and + * attaches it. In this case, the layer is responsible for managing + * the buffer. + */ + TensorDataType* m_embeddings_buffer{nullptr}; + /** Allocated size of @c m_embeddings_buffer. */ + size_t m_embeddings_buffer_size{0}; + + /** SHMEM buffer to communicate embedding vectors. */ + TensorDataType* m_workspace_buffer{nullptr}; + /** Allocated size of @c m_workspace_buffer. */ + size_t m_workspace_buffer_size{0}; + + /** SHMEM buffer to communicate metadata for embedding vectors. */ + vector_metadata* m_metadata_buffer{nullptr}; + /** Allocated size of @c m_metadata_buffer. */ + size_t m_metadata_buffer_size{0}; + + /** Request to synchronize non-blocking barriers. + * + * Careful synchronization is required to ensure the correctness of + * asynchronous, one-sided communication via SHMEM buffers. After + * any modification to a SHMEM buffer (local or remote), a + * non-blocking barrier is launched to signal that the local + * process has finished its work. Before the next access to the + * SHMEM buffer, the non-blocking barrier is synchronized to make + * sure that all remote processes have finished their work and that + * the buffers are safe to access. + */ + Al::request m_nb_barrier_request; + + /** Size of dictionary of embeddings. */ + size_t m_num_embeddings; + /** Size of embedding vectors. */ + size_t m_embedding_dim; + + /** Perform sparse SGD during backprop. + * + * Bypasses optimizer class. + */ + bool m_sparse_sgd; + /** SGD learning rate. 
*/ + DataType m_learning_rate; + + /** Perform a blocking barrier at the beginning of forward prop. + * + * This layer performs synchronization with non-blocking barriers + * to ensure the correctness of asynchronous communication. + * However, gradient checking changes the embedding values without + * performing any synchronization. The quickest fix is to do a + * blocking barrier at the beginning of forward prop to make sure + * that all the embeddings are ready to be accessed. + * + * @todo Think of a way to avoid this synchronization. + */ + bool m_barrier_in_forward_prop; + +}; + +// --------------------------------------------- +// Implementation +// --------------------------------------------- + +template +dist_embedding_layer::dist_embedding_layer( + lbann_comm* comm, + size_t num_embeddings, + size_t embedding_dim, + bool sparse_sgd, + DataType learning_rate, + bool barrier_in_forward_prop) + : data_type_layer(comm), + m_num_embeddings{num_embeddings}, + m_embedding_dim{embedding_dim}, + m_sparse_sgd{sparse_sgd}, + m_learning_rate{learning_rate}, + m_barrier_in_forward_prop{barrier_in_forward_prop} { + + // Learning rate is only used for sparse SGD + if (!m_sparse_sgd) { + m_learning_rate = -1.0; + } + +} + +template +dist_embedding_layer::dist_embedding_layer( + const dist_embedding_layer& other) + : data_type_layer(other) { + LBANN_ERROR("copy constructor is invalid for dist_embedding_layer"); +} + +template +dist_embedding_layer& dist_embedding_layer::operator=( + const dist_embedding_layer& other) { + LBANN_ERROR("copy assignment operator is invalid for dist_embedding_layer"); +} + +template +dist_embedding_layer* dist_embedding_layer::copy() const { + return new dist_embedding_layer(*this); +} + +template +std::string dist_embedding_layer::get_type() const { + return "distributed embedding"; +} + +template +data_layout dist_embedding_layer::get_data_layout() const { + return Layout; +} + +template +El::Device dist_embedding_layer::get_device_allocation() const { + return Device; +} + +template +description dist_embedding_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Num embeddings", m_num_embeddings); + desc.add("Embedding dim", m_embedding_dim); + desc.add("Using sparse SGD", m_sparse_sgd); + desc.add("SGD learning rate", m_learning_rate); + return desc; +} + +template +void dist_embedding_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + auto dims = this->get_input_dims(); + dims.push_back(static_cast(m_embedding_dim)); + this->set_output_dims(dims); +} + +template +void dist_embedding_layer::setup_data(size_t max_mini_batch_size) { + data_type_layer::setup_data(max_mini_batch_size); + + // Synchronize non-blocking barrier + // Note: Make sure SHMEM buffers are safe to reset. + auto& comm = *this->get_comm(); + comm.wait(m_nb_barrier_request); + + // Construct default weights if needed + // Note: Randomly drawn from normal distribution with mean 0 and + // standard deviation 1. 
+ if (!this->has_weights()) { + auto w = make_unique>(&comm); + auto init = make_unique>(0,1); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); + } + if (this->num_weights() != 1) { + LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 1, found ",this->num_weights(),")"); + } + + // Configure embedding weights + auto& embeddings = this->get_weights(0); + { + auto dist = this->get_prev_activations().DistData(); + dist.colDist = El::STAR; + dist.rowDist = El::VC; + embeddings.set_dims( + {static_cast(m_embedding_dim)}, + {static_cast(m_num_embeddings)}); + embeddings.set_matrix_distribution(dist); + } + + // Destroy embedding optimizer and create dummy weights + // Note: This layer manually performs sparse SGD on embedding + // weights during backprop, so the embedding optimizer isn't needed. + // However, the layer must send gradients to some optimizer to + // prevent the model from optimizing the layer out of compute graph + // during backprop. We get around this by creating dummy weights + // with no entries. + if (m_sparse_sgd) { + embeddings.set_optimizer(nullptr); + auto w = make_unique>(&comm); + auto opt = make_unique>(0.); + w->set_name(this->get_name() + "_dummy_weights"); + w->set_optimizer(std::move(opt)); + w->set_dims(1); + w->set_matrix_distribution(embeddings.get_matrix_distribution()); + w->setup(); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); + } + + // Setup embedding weights + embeddings.setup(); + attach_embeddings_to_shmem_buffer(); + + // Non-blocking barrier + // Note: Embeddings have been initialized + nb_barrier(comm, comm.get_trainer_comm(), m_nb_barrier_request); + +} + +template +bool dist_embedding_layer::update_compute() { + + // Apply sparse SGD if needed + if (m_sparse_sgd) { + const size_t input_size = this->get_input_size(); + const size_t mini_batch_size = this->get_prev_activations().Width(); + using ValuesGetter = weights_details::SafeWeightsAccessor; + auto& embeddings = ValuesGetter::mutable_values(this->get_weights(0)); + auto& local_embeddings = dynamic_cast(embeddings.Matrix()); + apply_sparse_sgd_step(input_size * mini_batch_size, local_embeddings); + } + + // Non-blocking barrier + // Note: Embeddings are up-to-date. 
+ auto& comm = *this->get_comm(); + comm.wait(m_nb_barrier_request); + nb_barrier(comm, comm.get_trainer_comm(), m_nb_barrier_request); + + return true; +} + +template +void dist_embedding_layer::nb_barrier( + lbann_comm& comm, + const El::mpi::Comm& c, + Al::request& req) { + static El::Matrix buffer; + buffer.SetMemoryMode(0); // Don't use memory pool + buffer.Resize(1, 1); + comm.nb_allreduce(buffer, c, req); +} + +// --------------------------------------------- +// Explicit template instantiation +// --------------------------------------------- + +#ifdef LBANN_HAS_SHMEM +extern template class dist_embedding_layer< + float, data_layout::DATA_PARALLEL, El::Device::CPU>; +#endif // LBANN_HAS_SHMEM +#if defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM) +extern template class dist_embedding_layer< + float, data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM) + +} // namespace lbann +#endif // defined(LBANN_HAS_SHMEM) || defined(LBANN_HAS_NVSHMEM) + +// --------------------------------------------- +// Builder function +// --------------------------------------------- + +namespace lbann +{ + +LBANN_DEFINE_LAYER_BUILDER(dist_embedding); + +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_DIST_EMBEDDING_HPP_INCLUDED diff --git a/include/lbann/layers/misc/mini_batch_index.hpp b/include/lbann/layers/misc/mini_batch_index.hpp index 51538000dce..75f882ea1b9 100644 --- a/include/lbann/layers/misc/mini_batch_index.hpp +++ b/include/lbann/layers/misc/mini_batch_index.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_MINI_BATCH_INDEX_HPP_INCLUDED #define LBANN_LAYERS_MISC_MINI_BATCH_INDEX_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -37,11 +37,13 @@ namespace lbann { * mini-batch sample. Each sample in a model's mini-batch has a * unique index in [0, mini_batch_size). 
*/ -template -class mini_batch_index_layer : public Layer { +template +class mini_batch_index_layer : public data_type_layer { public: - mini_batch_index_layer(lbann_comm* comm) : Layer(comm) { + mini_batch_index_layer(lbann_comm* comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 0; } @@ -52,20 +54,21 @@ class mini_batch_index_layer : public Layer { protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); } void fp_compute() override { + using CPUMatType = El::Matrix; // Get output matrix - auto& output = get_activations(); + auto& output = this->get_activations(); auto& local_output = output.Matrix(); const auto& local_width = local_output.Width(); // Create temporary matrix if output matrix is not on CPU - CPUMat local_output_v; + CPUMatType local_output_v; if (local_output.GetDevice() == El::Device::CPU) { El::View(local_output_v, local_output); } else { @@ -75,7 +78,7 @@ class mini_batch_index_layer : public Layer { // Populate matrix on CPU LBANN_OMP_PARALLEL_FOR for (El::Int col = 0; col < local_width; ++col) { - local_output_v(0, col) = DataType(output.GlobalCol(col)); + local_output_v(0, col) = El::To(output.GlobalCol(col)); } // Copy result from CPU if needed @@ -87,6 +90,15 @@ class mini_batch_index_layer : public Layer { }; +#ifndef LBANN_MINI_BATCH_INDEX_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class mini_batch_index_layer; \ + extern template class mini_batch_index_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_MINI_BATCH_INDEX_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_MINI_BATCH_INDEX_HPP_INCLUDED diff --git a/include/lbann/layers/misc/mini_batch_size.hpp b/include/lbann/layers/misc/mini_batch_size.hpp index 5a1445ef422..bd011a73ecd 100644 --- a/include/lbann/layers/misc/mini_batch_size.hpp +++ b/include/lbann/layers/misc/mini_batch_size.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_MINI_BATCH_SIZE_HPP_INCLUDED #define LBANN_LAYERS_MISC_MINI_BATCH_SIZE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -36,11 +36,13 @@ namespace lbann { * Output tensor is a 1D tensor with a single entry containing the * model's current mini-batch size. 
*/ -template -class mini_batch_size_layer : public Layer { +template +class mini_batch_size_layer : public data_type_layer { public: - mini_batch_size_layer(lbann_comm* comm) : Layer(comm) { + mini_batch_size_layer(lbann_comm* comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 0; } @@ -51,18 +53,18 @@ class mini_batch_size_layer : public Layer { protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); } void fp_setup_outputs(El::Int mini_batch_size) override { - Layer::fp_setup_outputs(mini_batch_size); + data_type_layer::fp_setup_outputs(mini_batch_size); m_mini_batch_size = mini_batch_size; } void fp_compute() override { - El::Fill(get_activations(), DataType(m_mini_batch_size)); + El::Fill(this->get_activations(), El::To(m_mini_batch_size)); } private: @@ -72,6 +74,15 @@ class mini_batch_size_layer : public Layer { }; +#ifndef LBANN_MINI_BATCH_SIZE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class mini_batch_size_layer; \ + extern template class mini_batch_size_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_MINI_BATCH_SIZE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_MINI_BATCH_SIZE_HPP_INCLUDED diff --git a/include/lbann/layers/misc/one_hot.hpp b/include/lbann/layers/misc/one_hot.hpp new file mode 100644 index 00000000000..a4b4ea0ecd4 --- /dev/null +++ b/include/lbann/layers/misc/one_hot.hpp @@ -0,0 +1,89 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED +#define LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief Convert index to a one-hot vector + * + * Expects a scalar input tensor and outputs a 1-D output tensor with + * @c size entries. The input is interpreted as an index, and output + * entries are one if they correspond to that index and zero + * otherwise. If the input is outside @f$[0,\text{size})@f$, then the + * output is all zeros. 
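The mapping described above is simple enough to state directly. This is an illustrative standalone sketch of the per-sample behavior; the function name and types are hypothetical and not part of LBANN:

    #include <cstddef>
    #include <vector>

    std::vector<float> one_hot(long index, std::size_t size) {
      std::vector<float> out(size, 0.0f);
      if (index >= 0 && static_cast<std::size_t>(index) < size) {
        out[index] = 1.0f;  // entry matching the index is set to one
      }                     // an out-of-range index leaves the output all zeros
      return out;
    }
    // one_hot(3, 5) -> {0, 0, 0, 1, 0};  one_hot(7, 5) -> {0, 0, 0, 0, 0}
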
+ */ +template +class one_hot_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "one-hot layer only supports data-parallel layout"); +public: + + one_hot_layer(lbann_comm* comm, size_t size) : data_type_layer(comm) { + this->set_output_dims({static_cast(size)}); + } + one_hot_layer* copy() const override { return new one_hot_layer(*this); } + std::string get_type() const override { return "one-hot"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + + // Make sure input tensor is scalar + if (this->get_input_size() != 1) { + const auto input_dims = this->get_input_dims(); + std::ostringstream dim_ss; + for (size_t i = 0; i < input_dims.size(); ++i) { + dim_ss << (i > 0 ? "x" : "") << input_dims[i]; + } + LBANN_ERROR(get_type()," layer \"",this->get_name(),"\" ", + "received an input tensor with invalid dimensions ", + "(expected 1, got ",dim_ss.str(),")"); + } + + } + + void fp_compute() override; + +}; + +#ifndef LBANN_ONE_HOT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class one_hot_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_ONE_HOT_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED diff --git a/include/lbann/layers/misc/variance.hpp b/include/lbann/layers/misc/variance.hpp index bc36581b73f..4006a161667 100644 --- a/include/lbann/layers/misc/variance.hpp +++ b/include/lbann/layers/misc/variance.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED #define LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -42,20 +42,29 @@ namespace lbann { * Scaling by @f$ 1/n @f$ instead of @f$ 1/(n-1) @f$ is a biased * estimator. */ -template -class variance_layer : public Layer { +template +class variance_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: variance_layer(lbann_comm *comm, bool biased) - : Layer(comm), m_biased(biased) {} + : data_type_layer(comm), m_biased(biased) {} variance_layer(const variance_layer& other) - : Layer(other), + : data_type_layer(other), m_biased(other.m_biased), m_means(other.m_means ? other.m_means->Copy() : nullptr), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) {} variance_layer& operator=(const variance_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_biased = other.m_biased; m_means.reset(other.m_means ? other.m_means->Copy() : nullptr); m_workspace.reset(other.m_workspace ? 
@@ -69,7 +78,7 @@ class variance_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("Biased", m_biased); return desc; } @@ -77,21 +86,21 @@ class variance_layer : public Layer { protected: void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist_data = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist_data = this->get_prev_activations().DistData(); dist_data.colDist = El::STAR; - m_means.reset(AbsDistMat::Instantiate(dist_data)); - m_workspace.reset(AbsDistMat::Instantiate(dist_data)); + m_means.reset(AbsDistMatrixType::Instantiate(dist_data)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist_data)); } - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); - if (get_input_size() <= 1) { + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); + if (this->get_input_size() <= 1) { std::stringstream err; - const auto& parents = get_parent_layers(); - const auto& dims = get_input_dims(); - err << get_type() << " layer \"" << get_name() << "\" " + const auto& parents = this->get_parent_layers(); + const auto& dims = this->get_input_dims(); + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects an input tensor with at least two entries, " << "but parent layer \"" << parents[0]->get_name() << "\" " << "outputs a tensor with dimensions "; @@ -111,12 +120,21 @@ class variance_layer : public Layer { bool m_biased; /** Means for each mini-batch sample. */ - std::unique_ptr m_means; + std::unique_ptr m_means; /** Workspace. 
*/ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; +#ifndef LBANN_VARIANCE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class variance_layer; \ + extern template class variance_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_VARIANCE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/CMakeLists.txt b/include/lbann/layers/regularizers/CMakeLists.txt index cd27df13645..15384770bc0 100644 --- a/include/lbann/layers/regularizers/CMakeLists.txt +++ b/include/lbann/layers/regularizers/CMakeLists.txt @@ -2,6 +2,8 @@ set_full_path(THIS_DIR_HEADERS batch_normalization.hpp dropout.hpp + entrywise_batch_normalization.hpp + layer_norm.hpp local_response_normalization.hpp regularizer.hpp selu_dropout.hpp diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index 2f896fcf081..4c5a3013eed 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -29,6 +29,7 @@ #include "lbann/layers/regularizers/regularizer.hpp" #include "lbann/models/model.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { @@ -41,6 +42,36 @@ enum class batch_normalization_stats_aggregation { global }; +#ifdef LBANN_HAS_DISTCONV +template +class batch_normalization_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + batch_normalization_distconv_adapter(Layer& layer): + data_type_distconv_adapter(layer) {} + virtual ~batch_normalization_distconv_adapter() = default; + void setup_fp_tensors() override; + void setup_bp_tensors() override; + dc::Shape get_per_channel_stat_shape() const; + dc::Dist get_per_channel_stat_dist(const dc::Dist &input_dist) const; + void setup_layer(size_t workspace_capacity) override; + void fp_compute(); + void bp_compute(); + + TensorDevType m_mean; + TensorDevType m_var; + TensorDevType m_scale; + TensorDevType m_bias; + TensorDevType m_running_mean; + TensorDevType m_running_var; + TensorDevType m_mean_gradient; + TensorDevType m_var_gradient; + TensorDevType m_scale_gradient; + TensorDevType m_bias_gradient; + std::unique_ptr> m_bn; +}; +#endif // LBANN_HAS_DISTCONV + /** @brief * * Each input channel is normalized across the mini-batch to have @@ -54,35 +85,65 @@ enum class batch_normalization_stats_aggregation { * Shift." In International Conference on Machine Learning, * pp. 448-456. 2015. */ -template -class batch_normalization_layer : public regularizer_layer { +template +class batch_normalization_layer : public regularizer_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "batch normalization only supports DATA_PARALLEL"); +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + ///@} private: /** Decay rate for the running statistics. */ - DataType m_decay; + TensorDataType m_decay; /** Small number to avoid division by zero. */ - DataType m_epsilon; - /** Type of statistics aggregation to use. 
*/ - batch_normalization_stats_aggregation m_stats_aggregation; + TensorDataType m_epsilon; + /** @brief Size of group to aggregate statistics over. + * + * If this is 1, the group consists of one process and aggregation + * is local. If it is 0, statistics are aggregated globally. + */ + int m_statistics_group_size; /** * Cache of node-local num_per_sum results for node-local stats. * Indexed by effective mini-batch size. */ std::unordered_map m_num_per_sum_cache; - /** Current minibatch means. */ - std::unique_ptr m_mean; - /** Current minibatch standard deviations. */ - std::unique_ptr m_var; - /** Gradient w.r.t. means. */ - std::unique_ptr m_mean_gradient; - /** Gradient w.r.t. standard deviations. */ - std::unique_ptr m_var_gradient; + /** @brief Current minibatch means and standard deviations. + * + * These are fused for performance when doing non-local batchnorm. + */ + std::unique_ptr m_mean_and_var; + /** View of current mini-batch means. */ + std::unique_ptr m_mean_v; + /** View of current mini-batch standard deviations. */ + std::unique_ptr m_var_v; + /** @brief Gradients w.r.t. means and standard deviations. + * + * These are fused for performance when doing non-local batchnorm. + */ + std::unique_ptr m_mean_and_var_gradient; + /** View of gradient w.r.t. means. */ + std::unique_ptr m_mean_gradient_v; + /** View of gradient w.r.t. standard deviations. */ + std::unique_ptr m_var_gradient_v; /** Gradient w.r.t. scaling terms. */ - std::unique_ptr m_scale_gradient; + std::unique_ptr m_scale_gradient; /** Gradient w.r.t. bias terms. */ - std::unique_ptr m_bias_gradient; + std::unique_ptr m_bias_gradient; public: /** @brief Set up batch normalization. @@ -91,56 +152,64 @@ class batch_normalization_layer : public regularizer_layer { * @param decay Controls the momentum of the running mean/standard * deviation averages. * @param epsilon A small number to avoid division by zero. - * @param stats_aggregation The type of statistics to use when training. + * @param statistics_group_size Number of processors to aggregate + * statistics over. Defaults to 1 (i.e. local aggregation). */ batch_normalization_layer(lbann_comm *comm, - DataType decay=0.9, - DataType epsilon=1e-5, - batch_normalization_stats_aggregation stats_aggregation = - batch_normalization_stats_aggregation::local) - : regularizer_layer(comm), + TensorDataType decay=0.9, + TensorDataType epsilon=1e-5, + int statistics_group_size=1) + : regularizer_layer(comm), m_decay(decay), m_epsilon(epsilon), - m_stats_aggregation(stats_aggregation) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "batch normalization only supports DATA_PARALLEL"); + m_statistics_group_size(statistics_group_size) { #ifdef LBANN_DETERMINISTIC // Force global computation. - m_stats_aggregation = batch_normalization_stats_aggregation::global; + m_statistics_group_size = 0; #endif } batch_normalization_layer(const batch_normalization_layer& other) - : regularizer_layer(other), + : regularizer_layer(other), m_decay(other.m_decay), m_epsilon(other.m_epsilon), - m_stats_aggregation(other.m_stats_aggregation), + m_statistics_group_size(other.m_statistics_group_size), m_num_per_sum_cache(other.m_num_per_sum_cache), - m_mean(other.m_mean ? other.m_mean->Copy() : nullptr), - m_var(other.m_var ? other.m_var->Copy() : nullptr), - m_mean_gradient(other.m_mean_gradient ? - other.m_mean_gradient->Copy() : nullptr), - m_var_gradient(other.m_var_gradient ? - other.m_var_gradient->Copy() : nullptr), + m_mean_and_var(other.m_mean_and_var ? 
+ other.m_mean_and_var->Copy() : nullptr), + m_mean_v(other.m_mean_v ? other.m_mean_v->Copy() : nullptr), + m_var_v(other.m_var_v ? other.m_var_v->Copy() : nullptr), + m_mean_and_var_gradient(other.m_mean_and_var_gradient ? + other.m_mean_and_var_gradient->Copy() : nullptr), + m_mean_gradient_v(other.m_mean_gradient_v ? + other.m_mean_gradient_v->Copy() : nullptr), + m_var_gradient_v(other.m_var_gradient_v ? + other.m_var_gradient_v->Copy() : nullptr), m_scale_gradient(other.m_scale_gradient ? other.m_scale_gradient->Copy() : nullptr), m_bias_gradient(other.m_bias_gradient ? other.m_bias_gradient->Copy() : nullptr) {} batch_normalization_layer& operator=(const batch_normalization_layer& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_decay = other.m_decay; m_epsilon = other.m_epsilon; - m_stats_aggregation = other.m_stats_aggregation; + m_statistics_group_size = other.m_statistics_group_size; m_num_per_sum_cache = other.m_num_per_sum_cache; // Deep copy matrices - m_mean.reset(other.m_mean ? other.m_mean->Copy() : nullptr); - m_var.reset(other.m_var ? other.m_var->Copy() : nullptr); - m_mean_gradient.reset(other.m_mean_gradient ? - other.m_mean_gradient->Copy() : nullptr); - m_var_gradient.reset(other.m_var_gradient ? - other.m_var_gradient->Copy() : nullptr); + m_mean_and_var.reset(other.m_mean_and_var ? + other.m_mean_and_var->Copy() : nullptr); + m_mean_v.reset(other.m_mean_v ? + other.m_mean_v->Copy() : nullptr); + m_var_v.reset(other.m_var_v ? + other.m_var_v->Copy() : nullptr); + m_mean_and_var_gradient.reset(other.m_mean_and_var_gradient ? + other.m_mean_and_var_gradient->Copy() : nullptr); + m_mean_gradient_v.reset(other.m_mean_gradient_v ? + other.m_mean_gradient_v->Copy() : nullptr); + m_var_gradient_v.reset(other.m_var_gradient_v ? + other.m_var_gradient_v->Copy() : nullptr); m_scale_gradient.reset(other.m_scale_gradient ? other.m_scale_gradient->Copy() : nullptr); m_bias_gradient.reset(other.m_bias_gradient ? 
@@ -155,159 +224,154 @@ class batch_normalization_layer : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("Decay", m_decay); desc.add("Epsilon", m_epsilon); - switch (m_stats_aggregation) { - case batch_normalization_stats_aggregation::local: - desc.add("Statistics aggregation", "local"); - break; - case batch_normalization_stats_aggregation::node_local: - desc.add("Statistics aggregation", "node-local"); - break; - case batch_normalization_stats_aggregation::global: - desc.add("Statistics aggregation", "global"); - break; - } + desc.add("Statistics group size", m_statistics_group_size); return desc; } protected: void setup_matrices(const El::Grid& grid) override { - regularizer_layer::setup_matrices(grid); - m_mean.reset(new StarMat(grid)); - m_var.reset(new StarMat(grid)); - m_mean_gradient.reset(new StarMat(grid)); - m_var_gradient.reset(new StarMat(grid)); - m_scale_gradient.reset(new StarMat(grid)); - m_bias_gradient.reset(new StarMat(grid)); + regularizer_layer::setup_matrices(grid); + m_mean_and_var.reset(new StarMatDT(grid)); + m_mean_v.reset(new StarMatDT(grid)); + m_var_v.reset(new StarMatDT(grid)); + m_mean_and_var_gradient.reset(new StarMatDT(grid)); + m_mean_gradient_v.reset(new StarMatDT(grid)); + m_var_gradient_v.reset(new StarMatDT(grid)); + m_scale_gradient.reset(new StarMatDT(grid)); + m_bias_gradient.reset(new StarMatDT(grid)); } - void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + regularizer_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } - void setup_data() override { - regularizer_layer::setup_data(); - const auto& output_dims = get_output_dims(); + void setup_data(size_t max_mini_batch_size) override { + regularizer_layer::setup_data(max_mini_batch_size); + const auto& output_dims = this->get_output_dims(); const auto& num_channels = output_dims[0]; // Display warning if mini-batch size is small - const auto& output = get_activations(); + const auto& output = this->get_activations(); const auto& mini_batch_size = output.Width(); const auto& local_mini_batch_size = mini_batch_size / output.DistSize(); - if (m_stats_aggregation == batch_normalization_stats_aggregation::global - && mini_batch_size <= 4) { - std::stringstream err; - err << "LBANN warning: " - << get_type() << " layer \"" << get_name() << "\" " - << "is using global statistics and " - << "the mini-batch size (" << mini_batch_size << ") " - << "may be too small to get good statistics"; + if (m_statistics_group_size == 0 && mini_batch_size <= 4) { if (output.DistRank() == 0) { + std::stringstream err; + err << "LBANN warning: " + << get_type() << " layer \"" << this->get_name() << "\" " + << "is using global statistics and " + << "the mini-batch size (" << mini_batch_size << ") " + << "may be too small to get good statistics"; std::cerr << err.str() << std::endl; } - } else if (m_stats_aggregation == batch_normalization_stats_aggregation::node_local - && local_mini_batch_size*m_comm->get_procs_per_node() <= 4) { - std::stringstream err; - err << "LBANN warning: " - << get_type() << " layer \"" << get_name() << "\" " - << "is using node-local statistics and " - << "the node-local mini-batch size (" - << (local_mini_batch_size*m_comm->get_procs_per_node()) << 
") " - << "may be too small to get good statistics"; + } else if (m_statistics_group_size != 0 && + m_statistics_group_size*local_mini_batch_size <= 4) { + // This possibly underestimates the aggregation size for processors with + // smaller local mini-batch sizes. if (output.DistRank() == 0) { - std::cerr << err.str() << std::endl; - } - } else if (m_stats_aggregation == batch_normalization_stats_aggregation::local - && local_mini_batch_size <= 4) { - std::stringstream err; + std::stringstream err; err << "LBANN warning: " - << get_type() << " layer \"" << get_name() << "\" " - << "is using local statistics and " - << "the local mini-batch size (" << local_mini_batch_size << ") " + << get_type() << " layer \"" << this->get_name() << "\" " + << "is aggregating statistics over " + << m_statistics_group_size + << "processors and the aggregated mini-batch size (" + << (m_statistics_group_size*local_mini_batch_size) << ") " << "may be too small to get good statistics"; - if (output.DistRank() == 0) { std::cerr << err.str() << std::endl; } } // Initialize default weights if none are provided - if (this->m_weights.size() > 4) { + if (this->num_weights() > 4) { std::stringstream err; - err << "attempted to setup layer \"" << m_name << "\" " + err << "attempted to setup layer \"" << this->m_name << "\" " << "with an invalid number of weights"; LBANN_ERROR(err.str()); } - this->m_weights.resize(4, nullptr); - if (this->m_weights[0] == nullptr) { - this->m_weights[0] = new weights(get_comm()); - std::unique_ptr init(new constant_initializer(DataType(1))); - std::unique_ptr opt(m_model->create_optimizer()); - this->m_weights[0]->set_name(get_name() + "_scale"); - this->m_weights[0]->set_initializer(init); - this->m_weights[0]->set_optimizer(opt); - this->m_model->add_weights(this->m_weights[0]); + this->set_num_weights(4); + if (!this->has_weights(0)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::One()); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_scale"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->set_weights(0, w.get()); + this->m_model->add_weights(std::move(w)); } - if (this->m_weights[1] == nullptr) { - this->m_weights[1] = new weights(get_comm()); - std::unique_ptr init(new constant_initializer(DataType(0))); - std::unique_ptr opt(m_model->create_optimizer()); - this->m_weights[1]->set_name(get_name() + "_bias"); - this->m_weights[1]->set_initializer(init); - this->m_weights[1]->set_optimizer(opt); - this->m_model->add_weights(this->m_weights[1]); + if (!this->has_weights(1)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::Zero()); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_bias"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->set_weights(1, w.get()); + this->m_model->add_weights(std::move(w)); } - if (this->m_weights[2] == nullptr) { - this->m_weights[2] = new weights(get_comm()); - this->m_weights[2]->set_name(get_name() + "_running_mean"); - std::unique_ptr init(new constant_initializer(DataType(0))); - this->m_weights[2]->set_initializer(init); - this->m_model->add_weights(this->m_weights[2]); + if (!this->has_weights(2)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::Zero()); + w->set_name(this->get_name() + "_running_mean"); + w->set_initializer(std::move(init)); + this->set_weights(2, w.get()); + 
this->m_model->add_weights(std::move(w)); } - if (this->m_weights[3] == nullptr) { - this->m_weights[3] = new weights(get_comm()); - this->m_weights[3]->set_name(get_name() + "_running_variance"); - std::unique_ptr init(new constant_initializer(DataType(1))); - this->m_weights[3]->set_initializer(init); - this->m_model->add_weights(this->m_weights[3]); + if (!this->has_weights(3)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::One()); + w->set_name(this->get_name() + "_running_variance"); + w->set_initializer(std::move(init)); + this->set_weights(3, w.get()); + this->m_model->add_weights(std::move(w)); } // Setup weights - auto dist = get_prev_activations().DistData(); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; dist.rowDist = El::STAR; - for (auto* w : this->m_weights) { - w->set_dims(num_channels); - w->set_matrix_distribution(dist); + size_t const num_weights = this->num_weights(); + for (size_t ii = 0; ii < num_weights; ++ii) { + auto& w = this->get_weights(ii); + w.set_dims(num_channels); + w.set_matrix_distribution(dist); } // Initialize matrices - El::Zeros(*m_mean, num_channels, 1); - El::Zeros(*m_var, num_channels, 1); - El::Zeros(*m_mean_gradient, num_channels, 1); - El::Zeros(*m_var_gradient, num_channels, 1); + El::Zeros(*m_mean_and_var, num_channels, 2); + El::Zeros(*m_mean_and_var_gradient, num_channels, 2); El::Zeros(*m_scale_gradient, num_channels, 1); El::Zeros(*m_bias_gradient, num_channels, 1); + // Initialize views. + El::View(*m_mean_v, *m_mean_and_var, El::ALL, El::IR(0, 1)); + El::View(*m_var_v, *m_mean_and_var, El::ALL, El::IR(1, 2)); + El::View(*m_mean_gradient_v, *m_mean_and_var_gradient, + El::ALL, El::IR(0, 1)); + El::View(*m_var_gradient_v, *m_mean_and_var_gradient, + El::ALL, El::IR(1, 2)); + // Initialize freeze state - for (auto&& w : this->m_weights) { - if (m_frozen) { - w->freeze(); + for (size_t ii = 0; ii < num_weights; ++ii) { + auto& w = this->get_weights(ii); + if (this->m_frozen) { + w.freeze(); } else { - w->unfreeze(); + w.unfreeze(); } } - for (auto&& w : this->m_weights) { - if (w->is_frozen() != m_frozen) { - std::stringstream err; - err << (m_frozen ? "" : "un") << "frozen " - << "layer \"" << get_name() << "\" has " - << (w->is_frozen() ? "" : "un") << "frozen " - << "weights \"" << w->get_name() << "\""; - LBANN_ERROR(err.str()); + for (size_t ii = 0; ii < num_weights; ++ii) { + auto& w = this->get_weights(ii); + if (w.is_frozen() != this->m_frozen) { + LBANN_ERROR((this->m_frozen ? "" : "un"), "frozen layer " + "\"", this->get_name(), "\" has ", + (w.is_frozen() ? 
"" : "un"), "frozen weights " + "\"", w.get_name(), "\"");; } } @@ -316,8 +380,159 @@ class batch_normalization_layer : public regularizer_layer { void fp_compute() override; void bp_compute() override; +#ifdef LBANN_HAS_DISTCONV + friend class batch_normalization_distconv_adapter; + protected: + bool is_distconv_supported() const override { + return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique< + batch_normalization_distconv_adapter>(*this); + } + batch_normalization_distconv_adapter& get_distconv_adapter() override; + const batch_normalization_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +const batch_normalization_distconv_adapter& +batch_normalization_layer::get_distconv_adapter() const { + return dynamic_cast&>(data_type_layer::get_distconv_adapter()); +} + +template +batch_normalization_distconv_adapter& +batch_normalization_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +dc::Shape batch_normalization_distconv_adapter:: +get_per_channel_stat_shape() const { + auto &l = dynamic_cast&>(this->layer()); + const int num_channels = this->get_activations_shape()[dc::get_channel_dim()]; + // Sanity check that the shared tensors have the correct shape + assert_ne(num_channels, 0); + assert_eq(l.m_mean_and_var->Matrix().Width() * + l.m_mean_and_var->Matrix().Height(), + num_channels * 2); + dc::Shape per_channel_stat_shape(dc::get_num_dims(l), 1); + per_channel_stat_shape[dc::get_channel_dim()] = num_channels; + return per_channel_stat_shape; +} + +template +dc::Dist batch_normalization_distconv_adapter:: +get_per_channel_stat_dist(const dc::Dist &input_dist) const { + auto shared_dist = dc::Dist::make_distribution( + input_dist.get_locale_shape()); + auto split_shape = input_dist.get_split_shape(); + // set all dimensions to be 1 except for the channel dimension + auto pc = split_shape[-2]; + // set all elements to 1 + split_shape = 1; + split_shape[-2] = pc; + shared_dist.set_split_shape(split_shape); + + return shared_dist; +} + +template +void batch_normalization_distconv_adapter:: +setup_fp_tensors() { + data_type_distconv_adapter::setup_fp_tensors(); + + auto &l = static_cast&>(this->layer()); + const auto &input_dist = this->get_prev_activations_dist(); + + const auto per_channel_stat_shape = get_per_channel_stat_shape(); + const auto shared_dist = get_per_channel_stat_dist(input_dist); + + const dc::LocaleMPI loc(dc::get_mpi_comm(), false); + + // mean + m_mean = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View(m_mean, l.m_mean_v->Buffer())); + // var + m_var = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View(m_var, l.m_var_v->Buffer())); + // scale: view to weights[0] + m_scale = TensorDevType(per_channel_stat_shape, loc, shared_dist); + // bias: view to weights[1] + m_bias = TensorDevType(per_channel_stat_shape, loc, shared_dist); + // running_mean: view to weights[2] + m_running_mean = TensorDevType(per_channel_stat_shape, loc, shared_dist); + // running_var: view to weights[3] + m_running_var = TensorDevType(per_channel_stat_shape, loc, shared_dist); +} + +template +void batch_normalization_distconv_adapter:: +setup_bp_tensors() { + data_type_distconv_adapter::setup_bp_tensors(); + + const auto &prev_error_signal_dist = 
this->get_prev_error_signals_dist(); + auto &l = static_cast&>(this->layer()); + + const auto per_channel_stat_shape = get_per_channel_stat_shape(); + const auto shared_dist = get_per_channel_stat_dist( + prev_error_signal_dist); + + const dc::LocaleMPI loc(dc::get_mpi_comm(), false); + + // scale_gradient + m_scale_gradient = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View( + m_scale_gradient, l.m_scale_gradient->Buffer())); + // bias_gradient + m_bias_gradient = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View( + m_bias_gradient, l.m_bias_gradient->Buffer())); + // mean_gradient + m_mean_gradient = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View( + m_mean_gradient, l.m_mean_gradient_v->Buffer())); + // var_gradient + m_var_gradient = TensorDevType(per_channel_stat_shape, loc, shared_dist); + assert0(dc::tensor::View( + m_var_gradient, l.m_var_gradient_v->Buffer())); +} + +template +void batch_normalization_distconv_adapter::setup_layer( + size_t workspace_capacity) { + auto &l = dynamic_cast&>(this->layer()); + bool global_stats; + if (l.m_statistics_group_size == 0) { + global_stats = true; + } else if (l.m_statistics_group_size == 1) { + global_stats = false; + } else { + LBANN_ERROR("statistics_group_size must be either 0 or 1 for now."); + } + + m_bn = make_unique>( + dc::get_backend(), dc::get_num_dims(l), + l.m_decay, l.m_epsilon, global_stats); +} +#endif // LBANN_HAS_DISTCONV + +#ifndef LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class batch_normalization_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_REGULARIZER_BATCH_NORMALIZATION_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/dropout.hpp b/include/lbann/layers/regularizers/dropout.hpp index d19f4be4125..35e72af956c 100644 --- a/include/lbann/layers/regularizers/dropout.hpp +++ b/include/lbann/layers/regularizers/dropout.hpp @@ -28,7 +28,9 @@ #define LBANN_LAYER_REGULARIZER_DROPOUT_HPP_INCLUDED #include "lbann/layers/regularizers/regularizer.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/cudnn.hpp" +#include "lbann/utils/random_number_generators.hpp" namespace lbann { @@ -43,13 +45,22 @@ namespace lbann { * prevent neural networks from overfitting." The Journal of Machine * Learning Research 15, no. 1 (2014): 1929-1958. */ -template -class dropout : public regularizer_layer { +template +class dropout : public regularizer_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: /** Keep units with probabiliy keep_prob. 
*/ dropout(lbann_comm *comm, EvalType keep_prob = EvalType(0.5)) - : regularizer_layer(comm), + : regularizer_layer(comm), m_keep_prob(keep_prob) #ifdef LBANN_HAS_CUDNN , m_dropout_cudnn_desc(nullptr), @@ -58,7 +69,7 @@ class dropout : public regularizer_layer { { #if defined(LBANN_HAS_CUDNN) && defined(LBANN_DETERMINISTIC) /// @todo GPU implementation of dropout with sequential consistency - if (Dev == El::Device::GPU && get_comm()->am_trainer_master()) { + if (Dev == El::Device::GPU && this->get_comm()->am_trainer_master()) { std::cerr << "Warning: GPU dropout currently does not guarantee " << "sequential consistency" << std::endl; } @@ -66,7 +77,7 @@ class dropout : public regularizer_layer { } dropout(const dropout& other) - : regularizer_layer(other), + : regularizer_layer(other), m_keep_prob(other.m_keep_prob), m_mask(other.m_mask ? other.m_mask->Copy() : nullptr) #ifdef LBANN_HAS_CUDNN @@ -85,9 +96,9 @@ class dropout : public regularizer_layer { } dropout& operator=(const dropout& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_keep_prob = other.m_keep_prob; - m_mask = other.m_mask ? other.m_mask->Copy() : nullptr; + m_mask = other.m_mask ? std::unique_ptr(other.m_mask->Copy()) : nullptr; #ifdef LBANN_HAS_CUDNN m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; m_tensors_cudnn_desc.set_layer(this); @@ -117,25 +128,33 @@ class dropout : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("Keep probability", m_keep_prob); return desc; } + /** @brief get prob for keep each unit. */ + EvalType get_keep_prob() const { + return m_keep_prob; + } + /** @brief set prob for keep each unit. 
*/ + void set_keep_prob(EvalType keep_prob) { + m_keep_prob = keep_prob; + } protected: - void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + regularizer_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - regularizer_layer::setup_matrices(grid); - m_mask = std::unique_ptr(get_activations().Copy()); + regularizer_layer::setup_matrices(grid); + m_mask = std::unique_ptr(this->get_activations().Copy()); } void setup_gpu() override { - regularizer_layer::setup_gpu(); + regularizer_layer::setup_gpu(); #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else @@ -147,7 +166,7 @@ class dropout : public regularizer_layer { } void fp_compute () override { - if (using_gpus()) { + if (this->using_gpus()) { fp_compute_gpu(); } else { fp_compute_cpu(); @@ -155,7 +174,7 @@ class dropout : public regularizer_layer { } void bp_compute () override { - if (using_gpus()) { + if (this->using_gpus()) { bp_compute_gpu(); } else { bp_compute_cpu(); @@ -167,31 +186,31 @@ class dropout : public regularizer_layer { void fp_compute_cpu() { // Matrices - const auto& input = get_prev_activations(); - auto& output = get_activations(); + const auto& input = this->get_prev_activations(); + auto& output = this->get_activations(); // Do nothing if dropout is disabled - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(input, output); return; } // Construct mask matrix - const DataType scale = 1 / m_keep_prob; + const TensorDataType scale = static_cast(1 / m_keep_prob); const auto& height = input.Height(); const auto& width = input.Width(); m_mask->Resize(height, width); #ifdef LBANN_DETERMINISTIC - bernoulli_fill_procdet(*m_mask, height, width, DataType(m_keep_prob)); + bernoulli_fill_procdet(*m_mask, height, width, TensorDataType(m_keep_prob)); El::Scale(scale, *m_mask); #else El::EntrywiseMap(*m_mask, - (std::function) - ([this,scale](const DataType& z)->DataType { + (std::function) + ([this,scale](const TensorDataType& z)->TensorDataType { auto& gen = get_fast_generator(); std::bernoulli_distribution dist(m_keep_prob); - return dist(gen) ? scale : DataType(0); + return dist(gen) ? scale : El::TypeTraits::Zero(); })); #endif // LBANN_DETERMINISTIC @@ -202,9 +221,9 @@ class dropout : public regularizer_layer { /** Adjust gradients for dropout in backprop. 
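Editor's note: the CPU forward pass above builds an inverted-dropout mask, so the scaling by 1/keep_prob happens at training time. A stand-alone sketch of that mask construction (plain standard-library code, not the LBANN generators):

#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

// Kept entries are scaled by 1/keep_prob so the expectation of mask*x equals
// x, which is why no rescaling is needed at inference time.
int main() {
  const double keep_prob = 0.5;
  const double scale = 1.0 / keep_prob;
  std::mt19937 gen(20200101);                  // placeholder seed, not LBANN's RNG
  std::bernoulli_distribution dist(keep_prob);

  std::vector<double> x(8, 1.0), y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    y[i] = dist(gen) ? scale * x[i] : 0.0;     // mask entry is scale or 0
  }
  for (double v : y) { std::cout << v << ' '; }  // roughly half 2s, half 0s
  std::cout << '\n';
  return 0;
}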
*/ void bp_compute_cpu() { - const auto& gradient_wrt_output = get_prev_error_signals(); - auto& gradient_wrt_input = get_error_signals(); - const auto& mode = this->m_model->get_execution_mode(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(gradient_wrt_output, gradient_wrt_input); } else { @@ -218,13 +237,13 @@ class dropout : public regularizer_layer { #else // Matrices - const auto& input = get_prev_activations(); + const auto& input = this->get_prev_activations(); const auto& local_input = input.LockedMatrix(); - auto& output = get_activations(); + auto& output = this->get_activations(); auto& local_output = output.Matrix(); // Do nothing if dropout is disabled or there is no local data - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(input, output); return; @@ -236,7 +255,7 @@ class dropout : public regularizer_layer { auto&& output_desc = m_tensors_cudnn_desc.get_activations(); size_t size; CHECK_CUDNN(cudnnDropoutGetReserveSpaceSize(input_desc, &size)); - m_reserve_space.Resize((size + sizeof(DataType) - 1) / sizeof(DataType), 1); + m_reserve_space.Resize((size + sizeof(TensorDataType) - 1) / sizeof(TensorDataType), 1); // Apply dropout on the GPU CHECK_CUDNN(cudnnDropoutForward(cudnn::get_handle(), @@ -246,7 +265,7 @@ class dropout : public regularizer_layer { output_desc, local_output.Buffer(), m_reserve_space.Buffer(), - m_reserve_space.Height() * sizeof(DataType))); + m_reserve_space.Height() * sizeof(TensorDataType))); #endif // LBANN_HAS_CUDNN } @@ -257,13 +276,13 @@ class dropout : public regularizer_layer { #else // Matrices - const auto& gradient_wrt_output = get_prev_error_signals(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); const auto& local_gradient_wrt_output = gradient_wrt_output.LockedMatrix(); - auto& gradient_wrt_input = get_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(); auto& local_gradient_wrt_input = gradient_wrt_input.Matrix(); // Copy error signal if dropout is disabled - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(gradient_wrt_output, gradient_wrt_input); } else { @@ -276,7 +295,7 @@ class dropout : public regularizer_layer { m_tensors_cudnn_desc.get_error_signals(), local_gradient_wrt_input.Buffer(), m_reserve_space.Buffer(), - m_reserve_space.Height() * sizeof(DataType))); + m_reserve_space.Height() * sizeof(TensorDataType))); } } #endif // LBANN_HAS_CUDNN @@ -296,7 +315,7 @@ class dropout : public regularizer_layer { // Setup RNG state size_t size; CHECK_CUDNN(cudnnDropoutGetStatesSize(cudnn::get_handle(), &size)); - m_states.Resize((size + sizeof(DataType) - 1) / sizeof(DataType), 1); + m_states.Resize((size + sizeof(TensorDataType) - 1) / sizeof(TensorDataType), 1); // Setup dropout descriptor CHECK_CUDNN(cudnnCreateDropoutDescriptor(&m_dropout_cudnn_desc)); @@ -304,7 +323,7 @@ class dropout : public regularizer_layer { cudnn::get_handle(), float(1 - m_keep_prob), m_states.Buffer(), - m_states.Height() * sizeof(DataType), + 
m_states.Height() * sizeof(TensorDataType), get_generator()())); } @@ -313,21 +332,35 @@ class dropout : public regularizer_layer { /** Probability of keeping each unit. */ EvalType m_keep_prob; /** Current dropout mask (a scaled Bernoulli random matrix). */ - std::unique_ptr m_mask; + std::unique_ptr m_mask; #ifdef LBANN_HAS_CUDNN /** Dropout cuDNN descriptor. */ cudnnDropoutDescriptor_t m_dropout_cudnn_desc; /** Tensor cuDNN descriptors. */ - cudnn::entrywise_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::entrywise_layer_tensor_manager m_tensors_cudnn_desc; /** RNG state for cuDNN dropout. */ - GPUMat m_states; + El::Matrix m_states; /** Work space for cuDNN dropout. */ - GPUMat m_reserve_space; + El::Matrix m_reserve_space; #endif // LBANN_HAS_CUDNN }; +template +using dropout_layer = dropout; + +LBANN_DEFINE_LAYER_BUILDER(dropout); + +#ifndef LBANN_DROPOUT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class dropout; \ + extern template class dropout + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_DROPOUT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_REGULARIZER_DROPOUT_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp new file mode 100644 index 00000000000..67ed575faf8 --- /dev/null +++ b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp @@ -0,0 +1,249 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED +#define LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/memory.hpp" + +namespace lbann { + +/** @brief + * + * Each input entry is normalized across the mini-batch to have zero + * mean and unit standard deviation. This uses the standard approach + * of maintaining the running mean and standard deviation (with + * exponential decay) for use at test time. See: + * + * Sergey Ioffe and Christian Szegedy. "Batch Normalization: + * Accelerating Deep Network Training by Reducing Internal Covariate + * Shift." In International Conference on Machine Learning, + * pp. 448-456. 2015. 
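Editor's note: a stand-alone sketch of the computation this comment describes, assuming the usual exponential-decay update (running = decay*running + (1-decay)*batch statistic) and the 0/1 initial values given to the running-mean/variance weights in this layer; names are illustrative, not LBANN API:

#include <cmath>
#include <iostream>
#include <vector>

// One input entry is normalized across the mini-batch, and its batch
// mean/variance are folded into the running statistics used at test time.
int main() {
  const double decay = 0.9, epsilon = 1e-5;
  std::vector<double> entry = {1.0, 2.0, 3.0, 4.0};   // one entry, 4 samples

  double mean = 0.0, sqmean = 0.0;
  for (double v : entry) { mean += v; sqmean += v * v; }
  mean /= entry.size();
  sqmean /= entry.size();
  const double var = sqmean - mean * mean;

  double running_mean = 0.0, running_var = 1.0;       // initial weights values
  running_mean = decay * running_mean + (1.0 - decay) * mean;
  running_var  = decay * running_var  + (1.0 - decay) * var;

  for (double& v : entry) { v = (v - mean) / std::sqrt(var + epsilon); }
  std::cout << running_mean << ' ' << running_var << '\n';   // 0.25 1.025
  return 0;
}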
+ */ +template +class entrywise_batch_normalization_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} + +public: + + entrywise_batch_normalization_layer(lbann_comm* comm, + TensorDataType decay=0.9, + TensorDataType epsilon=1e-5) + : data_type_layer(comm), m_decay(decay), m_epsilon(epsilon) {} + + entrywise_batch_normalization_layer(const entrywise_batch_normalization_layer& other) + : data_type_layer(other), + m_decay(other.m_decay), + m_epsilon(other.m_epsilon), + m_batch_statistics(other.m_batch_statistics ? + other.m_batch_statistics->Copy() : + nullptr), + m_batch_statistics_gradient(other.m_batch_statistics_gradient ? + other.m_batch_statistics_gradient->Copy() : + nullptr) {} + + entrywise_batch_normalization_layer& operator=(const entrywise_batch_normalization_layer& other) { + data_type_layer::operator=(other); + m_decay = other.m_decay; + m_epsilon = other.m_epsilon; + m_batch_statistics.reset(other.m_batch_statistics ? + other.m_batch_statistics->Copy() : + nullptr); + m_batch_statistics_gradient.reset(other.m_batch_statistics_gradient ? + other.m_batch_statistics_gradient->Copy() : + nullptr); + return *this; + } + + entrywise_batch_normalization_layer* copy() const override { return new entrywise_batch_normalization_layer(*this); } + std::string get_type() const override { return "entry-wise batch normalization"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + description get_description() const override { + auto desc = data_type_layer::get_description(); + desc.add("Decay", m_decay); + desc.add("Epsilon", m_epsilon); + return desc; + } + +protected: + + void setup_matrices(const El::Grid& grid) override { + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); + dist.rowDist = El::STAR; + m_batch_statistics.reset(AbsDistMatrixType::Instantiate(dist)); + m_batch_statistics_gradient.reset(AbsDistMatrixType::Instantiate(dist)); + } + + void setup_data(size_t max_mini_batch_size) override { + data_type_layer::setup_data(max_mini_batch_size); + + // Initialize output dimensions + this->set_output_dims(this->get_input_dims()); + const auto output_dims = this->get_output_dims(); + const auto output_size = this->get_output_size(); + + // Initialize default weights if none are provided + if (this->num_weights() > 2) { + std::stringstream err; + err << "attempted to setup layer \"" << this->get_name() << "\" " + << "with an invalid number of weights " + << "(found " << this->num_weights() << ", expected 2)"; + LBANN_ERROR(err.str()); + } + this->set_num_weights(2); + if (!this->has_weights(0)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::Zero()); + w->set_name(this->get_name() + "_running_mean"); + w->set_initializer(std::move(init)); + this->set_weights(0, w.get()); + this->m_model->add_weights(std::move(w)); + } + if (!this->has_weights(1)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(El::TypeTraits::One()); + w->set_name(this->get_name() + "_running_variance"); + w->set_initializer(std::move(init)); + this->set_weights(1, w.get()); + this->m_model->add_weights(std::move(w)); + } + + // Setup weights + auto dist = 
this->get_prev_activations().DistData(); + dist.rowDist = El::STAR; + auto const num_weights = this->num_weights(); + for (size_t ii = 0; ii < num_weights; ++ii) { + auto& w = this->get_weights(ii); + w.set_dims(output_dims); + w.set_matrix_distribution(dist); + } + + // Initialize matrices + m_batch_statistics->AlignWith(dist); + m_batch_statistics->Resize(output_size, 2); + m_batch_statistics_gradient->AlignWith(dist); + m_batch_statistics_gradient->Resize(output_size, 2); + + } + + void fp_setup_outputs(El::Int mini_batch_size) override { + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& input = this->get_prev_activations(); + const auto input_size = this->get_input_size(); + + // Make sure batch statistics tensor is aligned with input tensor + m_batch_statistics->Empty(false); + m_batch_statistics->AlignWith(input); + m_batch_statistics->Resize(input_size, 2); + +#if 0 /// @todo See https://github.com/LLNL/lbann/issues/1123 + + // Check that weights tensors is aligned with input tensor + /// @todo Realign tensors if misaligned + bool aligned = true; + try { + const auto& running_mean = weights_values(0); + const auto& running_var = weights_values(1); + aligned = (input.ColAlign() == running_mean.ColAlign() + && input.RowAlign() == running_mean.RowAlign() + && input.ColAlign() == running_var.ColAlign() + && input.RowAlign() == running_var.RowAlign()); + } + catch (const exception& e) { + // An exception is thrown if you try accessing weights values + // before they are initialized. We don't care if this case is + // aligned, so it's safe to ignore. + } + if (!aligned) { + std::ostringstream err; + err << this->get_type() << " layer \"" << this->get_name() << "\" " + << "has misaligned input and weights matrices"; + LBANN_ERROR(err.str()); + } + +#endif // 0 + + } + + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { + data_type_layer::bp_setup_gradient_wrt_inputs(mini_batch_size); + m_batch_statistics_gradient->Empty(false); + m_batch_statistics_gradient->AlignWith(this->get_prev_activations()); + m_batch_statistics_gradient->Resize(this->get_input_size(), 2); + } + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Decay rate for the running statistics. */ + TensorDataType m_decay; + /** Small number to avoid division by zero. */ + TensorDataType m_epsilon; + + /** @brief Current mini-batch statistics. + * + * These are fused for performance when doing non-local batchnorm. + */ + std::unique_ptr m_batch_statistics; + /** @brief Gradients w.r.t. current mini-batch statistics. + * + * These are fused for performance when doing non-local batchnorm. 
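Editor's note: "fused" here means the means and variances share one (size x 2) matrix, so non-local batchnorm can aggregate both with a single reduction instead of two. A stand-alone illustration of that layout using plain column-major storage rather than the Elemental matrices used by the layer:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const std::size_t size = 3;                 // number of entries/channels
  std::vector<double> stats(size * 2, 0.0);   // column 0: means, column 1: variances
  double* means = stats.data();               // "view" of column 0
  double* vars  = stats.data() + size;        // "view" of column 1

  means[0] = 0.5;
  vars[0]  = 1.25;
  // One contiguous buffer: a single reduction over stats.data() of length
  // size*2 covers both statistics at once.
  std::cout << stats[0] << ' ' << stats[size] << '\n';   // prints "0.5 1.25"
  return 0;
}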
+ */ + std::unique_ptr m_batch_statistics_gradient; + +}; + +#ifndef LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class entrywise_batch_normalization_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class entrywise_batch_normalization_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/instance_norm.hpp b/include/lbann/layers/regularizers/instance_norm.hpp new file mode 100644 index 00000000000..f9f3c7f41fd --- /dev/null +++ b/include/lbann/layers/regularizers/instance_norm.hpp @@ -0,0 +1,148 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_REGULARIZERS_INSTANCE_NORM_HPP_INCLUDED +#define LBANN_LAYERS_REGULARIZERS_INSTANCE_NORM_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +namespace lbann { + +/** @brief + * + * Each channel within a data sample is normalized to have zero mean + * and unit standard deviation. See: + * + * Dmitry Ulyanov, Andrea Vedaldi, and Victor Lempitsky. "Instance + * normalization: The missing ingredient for fast stylization." arXiv + * preprint arXiv:1607.08022 (2016). + * + * This is equivalent to applying layer normalization independently + * to each channel. Note that this layer does not apply a + * channel-wise scale and bias. Use the channel-wise scale/bias layer + * to reproduce that functionality. 
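Editor's note: a stand-alone sketch of the instance-norm computation described in the header below; within one data sample, every channel is normalized with its own mean and variance, and no learned scale/bias is applied (that is left to a separate layer). The per-channel sums and sums of squares mirror what the layer's workspace holds:

#include <cmath>
#include <iostream>
#include <vector>

int main() {
  const double epsilon = 1e-5;
  std::vector<std::vector<double>> sample = {   // 2 channels x 4 positions
      {1.0, 2.0, 3.0, 4.0},
      {10.0, 10.0, 20.0, 20.0}};

  for (auto& channel : sample) {
    double sum = 0.0, sumsq = 0.0;              // per-channel accumulators
    for (double v : channel) { sum += v; sumsq += v * v; }
    const double mean = sum / channel.size();
    const double var = sumsq / channel.size() - mean * mean;
    for (double& v : channel) { v = (v - mean) / std::sqrt(var + epsilon); }
  }
  std::cout << sample[0][0] << ' ' << sample[1][3] << '\n';
  return 0;
}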
+ * + */ +template +class instance_norm_layer : public data_type_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "instance norm layer only supports data parallel layout"); +public: + + /** + * @param comm LBANN communicator + * @param epsilon Small number to avoid division by zero + */ + instance_norm_layer(lbann_comm* comm, TensorDataType epsilon=1e-5); + + instance_norm_layer(const instance_norm_layer& other) = default; + instance_norm_layer& operator=(const instance_norm_layer& other) = default; + instance_norm_layer* copy() const override; + + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Small number to avoid division by zero. */ + TensorDataType m_epsilon; + + /** Contains per-channel sums and sums of squares. */ + El::Matrix m_workspace; + +}; + +// Builder function +LBANN_DEFINE_LAYER_BUILDER(instance_norm); + +// ========================================================= +// Implementation +// ========================================================= + +template +instance_norm_layer::instance_norm_layer( + lbann_comm* comm, + TensorDataType epsilon) + : data_type_layer(comm), m_epsilon(epsilon) +{} + +template +instance_norm_layer* instance_norm_layer::copy() const { + return new instance_norm_layer(*this); +} + +template +std::string instance_norm_layer::get_type() const { + return "instance norm"; +} + +template +data_layout instance_norm_layer::get_data_layout() const { + return Layout; +} + +template +El::Device instance_norm_layer::get_device_allocation() const { + return Device; +} + +template +description instance_norm_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Epsilon", m_epsilon); + return desc; +} + +template +void instance_norm_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); +} + +// ========================================================= +// Explicit template instantiation +// ========================================================= + +#ifndef LBANN_INSTANCE_NORM_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class instance_norm_layer< \ + T, data_layout::DATA_PARALLEL, Device>; +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_INSTANCE_NORM_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_REGULARIZERS_INSTANCE_NORM_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/layer_norm.hpp b/include/lbann/layers/regularizers/layer_norm.hpp new file mode 100644 index 00000000000..19421e91085 --- /dev/null +++ b/include/lbann/layers/regularizers/layer_norm.hpp @@ -0,0 +1,222 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED +#define LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" + +#include + +namespace lbann { + +/** @brief + * + * Each data sample is normalized to have zero mean and unit standard + * deviation. See: + * + * Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E. Hinton. "Layer + * normalization." arXiv preprint arXiv:1607.06450 (2016). + * + * Note that this layer does not apply an entry-wise scale and bias + * like in the paper. Use the entry-wise scale/bias layer to + * reproduce that functionality. + * + */ +template +class layer_norm_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + +public: + + /** + * @param comm LBANN communicator + * @param epsilon Small number to avoid division by zero + */ + layer_norm_layer(lbann_comm* comm, TensorDataType epsilon=1e-5); + + layer_norm_layer(const layer_norm_layer& other); + layer_norm_layer& operator=(const layer_norm_layer& other); + layer_norm_layer* copy() const override; + + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + description get_description() const override; + +protected: + + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_matrices(const El::Grid& grid) override; + void fp_setup_outputs(El::Int mini_batch_size) override; + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override; + + void fp_compute() override; + void bp_compute() override; + +private: + + using AbsDistMatType = El::AbstractDistMatrix; + + /** Small number to avoid division by zero. */ + TensorDataType m_epsilon; + + /** @brief Per-sample statistics. + * + * The means and variances are fused for performance. + */ + std::unique_ptr m_statistics; + /** @brief Gradients w.r.t. per-sample statistics. + * + * The means and variances are fused for performance. + */ + std::unique_ptr m_statistics_gradient; + +}; + +// ========================================================= +// Implementation +// ========================================================= + +template +layer_norm_layer::layer_norm_layer( + lbann_comm* comm, + TensorDataType epsilon) + : data_type_layer(comm), m_epsilon(epsilon) +{} + +template +layer_norm_layer::layer_norm_layer( + const layer_norm_layer& other) + : data_type_layer(other), + m_epsilon(other.m_epsilon), + m_statistics(other.m_statistics + ? other.m_statistics->Copy() + : nullptr), + m_statistics_gradient(other.m_statistics_gradient + ? 
other.m_statistics_gradient->Copy() + : nullptr) +{} + +template +layer_norm_layer& layer_norm_layer::operator=( + const layer_norm_layer& other) { + data_type_layer::operator=(other); + m_epsilon = other.m_epsilon; + m_statistics.reset(other.m_statistics + ? other.m_statistics->Copy() + : nullptr); + m_statistics_gradient.reset(other.m_statistics_gradient + ? other.m_statistics_gradient->Copy() + : nullptr); + return *this; +} + +template +layer_norm_layer* layer_norm_layer::copy() const { + return new layer_norm_layer(*this); +} + +template +std::string layer_norm_layer::get_type() const { + return "layer norm"; +} + +template +data_layout layer_norm_layer::get_data_layout() const { + return Layout; +} + +template +El::Device layer_norm_layer::get_device_allocation() const { + return Device; +} + +template +description layer_norm_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Epsilon", m_epsilon); + return desc; +} + +template +void layer_norm_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); +} + +template +void layer_norm_layer::setup_matrices(const El::Grid& grid) { + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); + dist.colDist = El::STAR; + m_statistics.reset(AbsDistMatrixType::Instantiate(dist)); + m_statistics_gradient.reset(AbsDistMatrixType::Instantiate(dist)); +} + +template +void layer_norm_layer::fp_setup_outputs(El::Int mini_batch_size) { + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& input = this->get_prev_activations(); + m_statistics->Empty(false); + m_statistics->AlignWith(input); + m_statistics->Resize(2, input.Width()); +} + +template +void layer_norm_layer::bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + data_type_layer::bp_setup_gradient_wrt_inputs(mini_batch_size); + const auto& input = this->get_prev_activations(); + m_statistics_gradient->Empty(false); + m_statistics_gradient->AlignWith(input); + m_statistics_gradient->Resize(2, input.Width()); +} + +// ========================================================= +// Explicit template instantiation +// ========================================================= + +#ifndef LBANN_LAYER_NORM_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class layer_norm_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class layer_norm_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_LAYER_NORM_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/local_response_normalization.hpp b/include/lbann/layers/regularizers/local_response_normalization.hpp index 23ff7051fab..77c077c5ce3 100644 --- a/include/lbann/layers/regularizers/local_response_normalization.hpp +++ b/include/lbann/layers/regularizers/local_response_normalization.hpp @@ -43,28 +43,35 @@ namespace lbann { * Advances in Neural Information Processing Systems, * pp. 1097-1105. 2012. 
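Editor's note: the LRN CPU path below evaluates the usual cross-channel response, y_c = x_c * (k + alpha * sum of x_j^2 over the channel window)^(-beta), and special-cases the default beta = 0.75 because sqrt(s * sqrt(s)) equals pow(s, 0.75) and avoids a pow() call. A stand-alone sketch of that computation for one spatial position:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

int main() {
  const double alpha = 1e-4, beta = 0.75, k = 2.0;
  const int window_width = 5;
  const std::vector<double> x = {0.5, 1.0, -2.0, 3.0, 0.25, -1.0};  // channels at one position
  std::vector<double> y(x.size());

  const int num_channels = static_cast<int>(x.size());
  for (int c = 0; c < num_channels; ++c) {
    const int first = std::max(c - window_width / 2, 0);
    const int last  = std::min(c + window_width / 2, num_channels - 1);
    double sumsq = 0.0;
    for (int j = first; j <= last; ++j) { sumsq += x[j] * x[j]; }
    const double scale = 1.0 / (k + alpha * sumsq);
    y[c] = x[c] * std::pow(scale, beta);
    // for the default beta = 0.75 this equals x[c] * std::sqrt(scale * std::sqrt(scale))
  }
  std::cout << y[0] << ' ' << y[2] << '\n';
  return 0;
}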
*/ -template -class local_response_normalization_layer : public regularizer_layer { +template +class local_response_normalization_layer : public regularizer_layer { +#ifdef LBANN_HAS_CUDNN + using ScalingType = cudnn::ScalingParamType; +#else + using ScalingType = TensorDataType; +#endif // LBANN_HAS_CUDNN + + static_assert(T_layout == data_layout::DATA_PARALLEL, + "local_response_normalization only supports DATA_PARALLEL"); public: local_response_normalization_layer(lbann_comm *comm, int window_width, - DataType alpha, - DataType beta, - DataType k) - : regularizer_layer(comm), + TensorDataType alpha, + TensorDataType beta, + TensorDataType k) + : regularizer_layer(comm), m_window_width(window_width), m_alpha(alpha), m_beta(beta), m_k(k) #ifdef LBANN_HAS_CUDNN , m_lrn_cudnn_desc(nullptr), m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN - { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "local_response_normalization only supports DATA_PARALLEL"); - } + { } local_response_normalization_layer(const local_response_normalization_layer& other) - : regularizer_layer(other), + : regularizer_layer(other), m_window_width(other.m_window_width), m_alpha(other.m_alpha), m_beta(other.m_beta), @@ -87,7 +94,7 @@ class local_response_normalization_layer : public regularizer_layer { } local_response_normalization_layer& operator=(const local_response_normalization_layer& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_window_width = other.m_window_width; m_alpha = other.m_alpha; m_beta = other.m_beta; @@ -110,6 +117,7 @@ class local_response_normalization_layer : public regularizer_layer { m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; m_tensors_cudnn_desc.set_layer(this); #endif // LBANN_HAS_CUDNN + return *this; } ~local_response_normalization_layer() override { @@ -128,7 +136,7 @@ class local_response_normalization_layer : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("alpha", m_alpha); desc.add("beta", m_beta); desc.add("k", m_k); @@ -137,14 +145,14 @@ class local_response_normalization_layer : public regularizer_layer { protected: - void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + regularizer_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } /// Initialize GPU objects void setup_gpu() override { - regularizer_layer::setup_gpu(); + regularizer_layer::setup_gpu(); #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else @@ -178,17 +186,17 @@ class local_response_normalization_layer : public regularizer_layer { /** Normalization window width. */ int m_window_width; /** LRN alpha scaling parameter. */ - DataType m_alpha; + TensorDataType m_alpha; /** LRN beta power parameter. */ - DataType m_beta; + TensorDataType m_beta; /** LRN k parameter. */ - DataType m_k; + TensorDataType m_k; #ifdef LBANN_HAS_CUDNN /** LRN cuDNN descriptor. */ cudnnLRNDescriptor_t m_lrn_cudnn_desc; /** Tensor cuDNN descriptors. 
*/ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN /// GPU implementation of forward propagation @@ -196,11 +204,11 @@ class local_response_normalization_layer : public regularizer_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); if (local_input.Height() > 0 && local_input.Width() > 0) { - const DataType zero = DataType(0); - const DataType one = DataType(1); + const ScalingType zero = El::TypeTraits::Zero(); + const ScalingType one = El::TypeTraits::One(); CHECK_CUDNN(cudnnLRNCrossChannelForward(cudnn::get_handle(), m_lrn_cudnn_desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, @@ -219,13 +227,13 @@ class local_response_normalization_layer : public regularizer_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_input = this->get_local_prev_activations(); + const auto& local_output = this->get_local_activations(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); if (local_input.Height() > 0 && local_input.Width() > 0) { - const DataType zero = DataType(0); - const DataType one = DataType(1); + const ScalingType zero = El::TypeTraits::Zero(); + const ScalingType one = El::TypeTraits::One(); CHECK_CUDNN(cudnnLRNCrossChannelBackward(cudnn::get_handle(), m_lrn_cudnn_desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, @@ -247,23 +255,24 @@ class local_response_normalization_layer : public regularizer_layer { void fp_compute_cpu() { // Local matrices - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); // Matrix parameters const int local_width = local_input.Width(); - const DataType* input_buffer = local_input.LockedBuffer(); + const TensorDataType* input_buffer = local_input.LockedBuffer(); const int input_ldim = local_input.LDim(); - DataType* output_buffer = local_output.Buffer(); + TensorDataType* output_buffer = local_output.Buffer(); const int output_ldim = local_output.LDim(); // Get LRN parameters - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_channel = get_output_size() / num_channels; + const int num_per_channel = this->get_output_size() / num_channels; // Check if LRN is using default beta parameter - const bool default_beta = (std::fabs((m_beta - 0.75) / 0.75) + const bool default_beta = (std::fabs((m_beta - El::To(0.75)) + / El::To(0.75)) < 2 * std::numeric_limits::epsilon()); //////////////////////////////////////////////////////////////// @@ -282,7 +291,7 @@ class local_response_normalization_layer : public regularizer_layer { block_start += max_block_size) { const int block_size = std::min(max_block_size, num_per_channel - block_start); - DataType workspace[max_block_size]; + TensorDataType 
workspace[max_block_size]; // Iterate through channels for (int channel = 0; channel < num_channels; ++channel) { @@ -290,32 +299,33 @@ class local_response_normalization_layer : public regularizer_layer { const int window_end = std::min(channel + m_window_width / 2, num_channels - 1); // Compute sum of squares in workspace - std::fill(workspace, workspace + block_size, DataType(0)); + std::fill(workspace, workspace + block_size, El::TypeTraits::Zero()); for (int window_pos = window_start; window_pos <= window_end; ++window_pos) { for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + window_pos * num_per_channel; - const DataType input_entry = input_buffer[index + sample * input_ldim]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; workspace[block_pos] += input_entry * input_entry; } } // Compute 1 / (k + alpha * sum(x^2) ) in workspace for (int block_pos = 0; block_pos < block_size; ++block_pos) { - workspace[block_pos] = 1 / (m_k + m_alpha * workspace[block_pos]); + workspace[block_pos] = El::TypeTraits::One() + / (m_k + m_alpha * workspace[block_pos]); } // Compute output for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + channel * num_per_channel; - const DataType scale_factor = workspace[block_pos]; - const DataType input_entry = input_buffer[index + sample * input_ldim]; - DataType& output_entry = output_buffer[index + sample * output_ldim]; + const TensorDataType scale_factor = workspace[block_pos]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; + TensorDataType& output_entry = output_buffer[index + sample * output_ldim]; if (default_beta) { // Special case when beta = 0.75 output_entry = (input_entry - * std::sqrt(scale_factor * std::sqrt(scale_factor))); + * El::Sqrt(scale_factor * El::Sqrt(scale_factor))); } else { - output_entry = input_entry * std::pow(scale_factor, m_beta); + output_entry = input_entry * El::Pow(scale_factor, m_beta); } } @@ -330,30 +340,31 @@ class local_response_normalization_layer : public regularizer_layer { void bp_compute_cpu() { // Get local matrices - const auto& local_input = get_local_prev_activations(); - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_input = this->get_local_prev_activations(); + const auto& local_output = this->get_local_activations(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); // Get matrix buffers const int local_width = local_input.Width(); - const DataType* input_buffer = local_input.LockedBuffer(); + const TensorDataType* input_buffer = local_input.LockedBuffer(); const int input_ldim = local_input.LDim(); - const DataType* output_buffer = local_output.LockedBuffer(); + const TensorDataType* output_buffer = local_output.LockedBuffer(); const int output_ldim = local_output.LDim(); - const DataType* gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(); + const TensorDataType* gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(); const int gradient_wrt_output_ldim = local_gradient_wrt_output.LDim(); - DataType* gradient_wrt_input_buffer = local_gradient_wrt_input.Buffer(); + TensorDataType* gradient_wrt_input_buffer = local_gradient_wrt_input.Buffer(); const int 
gradient_wrt_input_ldim = local_gradient_wrt_input.LDim(); // Get LRN parameters - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_channel = get_output_size() / num_channels; + const int num_per_channel = this->get_output_size() / num_channels; // Check if LRN is using default beta parameter - const bool default_beta = (std::fabs((m_beta - 0.75) / 0.75) - < 2 * std::numeric_limits::epsilon()); + const bool default_beta = (std::fabs((m_beta - El::To(0.75)) + / El::To(0.75)) + < El::To(2) * std::numeric_limits::epsilon()); //////////////////////////////////////////////////////////////// // error_signal(i) @@ -375,7 +386,7 @@ class local_response_normalization_layer : public regularizer_layer { block_start += max_block_size) { const int block_size = std::min(max_block_size, num_per_channel - block_start); - DataType workspace[max_block_size]; + TensorDataType workspace[max_block_size]; // Iterate through channels for (int channel = 0; channel < num_channels; ++channel) { @@ -383,45 +394,46 @@ class local_response_normalization_layer : public regularizer_layer { const int window_end = std::min(channel + m_window_width / 2, num_channels - 1); // Compute sum of squares in workspace - std::fill(workspace, workspace + block_size, DataType(0)); + std::fill(workspace, workspace + block_size, El::TypeTraits::Zero()); for (int window_pos = window_start; window_pos <= window_end; ++window_pos) { for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + window_pos * num_per_channel; - const DataType input_entry = input_buffer[index + sample * input_ldim]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; workspace[block_pos] += input_entry * input_entry; } } // Compute 1 / (k + alpha * sum(x^2) ) in workspace for (int block_pos = 0; block_pos < block_size; ++block_pos) { - workspace[block_pos] = 1 / (m_k + m_alpha * workspace[block_pos]); + workspace[block_pos] = El::TypeTraits::One() + / (m_k + m_alpha * workspace[block_pos]); } // Compute error signal contribution for current entry for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + channel * num_per_channel; - const DataType scale_factor = workspace[block_pos]; - const DataType gradient_wrt_output_entry + const TensorDataType scale_factor = workspace[block_pos]; + const TensorDataType gradient_wrt_output_entry = gradient_wrt_output_buffer[index + sample * gradient_wrt_output_ldim]; - DataType& gradient_wrt_input_entry + TensorDataType& gradient_wrt_input_entry = gradient_wrt_input_buffer[index + sample * gradient_wrt_input_ldim]; if (default_beta) { // Special case when beta = 0.75 gradient_wrt_input_entry - = gradient_wrt_output_entry * std::sqrt(scale_factor * std::sqrt(scale_factor)); + = gradient_wrt_output_entry * El::Sqrt(scale_factor * El::Sqrt(scale_factor)); } else { gradient_wrt_input_entry - = gradient_wrt_output_entry * std::pow(scale_factor, m_beta); + = gradient_wrt_output_entry * El::Pow(scale_factor, m_beta); } } // Compute y * dy / (k + alpha * sum(x^2) ) in workspace for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + channel * num_per_channel; - const DataType output_entry = output_buffer[index + sample * output_ldim]; - const DataType gradient_wrt_output_entry + const TensorDataType output_entry = output_buffer[index + sample * 
output_ldim]; + const TensorDataType gradient_wrt_output_entry = gradient_wrt_output_buffer[index + sample * gradient_wrt_output_ldim]; - workspace[block_pos] = (-2 * m_alpha * m_beta * workspace[block_pos] + workspace[block_pos] = (El::To(-2) * m_alpha * m_beta * workspace[block_pos] * output_entry * gradient_wrt_output_entry); } @@ -429,7 +441,7 @@ class local_response_normalization_layer : public regularizer_layer { for (int window_pos = window_start; window_pos <= window_end; ++window_pos) { for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + window_pos * num_per_channel; - const DataType input_entry = input_buffer[index + sample * input_ldim]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; gradient_wrt_input_buffer[index + sample * gradient_wrt_input_ldim] += workspace[block_pos] * input_entry; } @@ -444,6 +456,17 @@ class local_response_normalization_layer : public regularizer_layer { }; +LBANN_DEFINE_LAYER_BUILDER(local_response_normalization); + +#ifndef LBANN_LOCAL_RESPONSE_NORMALIZATION_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class local_response_normalization_layer< \ + T, data_layout::DATA_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_LOCAL_RESPONSE_NORMALIZATION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_LOCAL_RESPONSE_NORMALIZATION_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/regularizer.hpp b/include/lbann/layers/regularizers/regularizer.hpp index c01b892c820..51966d28258 100644 --- a/include/lbann/layers/regularizers/regularizer.hpp +++ b/include/lbann/layers/regularizers/regularizer.hpp @@ -26,16 +26,17 @@ #ifndef LBANN_LAYER_REGULARIZER_HPP_INCLUDED #define LBANN_LAYER_REGULARIZER_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { /** @todo Remove. Layers should inherit directly from the base layer * class. */ -class regularizer_layer : public Layer { +template +class regularizer_layer : public data_type_layer { public: - regularizer_layer(lbann_comm *comm) : Layer(comm) {} + regularizer_layer(lbann_comm *comm) : data_type_layer(comm) {} }; } // namespace lbann diff --git a/include/lbann/layers/regularizers/selu_dropout.hpp b/include/lbann/layers/regularizers/selu_dropout.hpp index a2b3d6475a3..f62e6e509c4 100644 --- a/include/lbann/layers/regularizers/selu_dropout.hpp +++ b/include/lbann/layers/regularizers/selu_dropout.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_REGULARIZER_SELU_DROPOUT_HPP_INCLUDED #include "lbann/layers/regularizers/regularizer.hpp" +#include "lbann/models/model.hpp" namespace lbann { @@ -39,15 +40,27 @@ namespace lbann { * Hochreiter. "Self-normalizing neural networks." In Advances in * Neural Information Processing Systems, pp. 971-980. 2017. */ -template -class selu_dropout : public regularizer_layer { +template +class selu_dropout : public regularizer_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The tensor type expected in this object. */ + using CPUMatrixType = El::Matrix; + + ///@} + public: /** Keep units with probabiliy keep_prob. 
*/ selu_dropout(lbann_comm *comm, - float keep_prob=0.95f, - DataType alpha = DataType(1.6732632423543772848170429916717), - DataType scale = DataType(1.0507009873554804934193349852946)) : - regularizer_layer(comm), + TensorDataType keep_prob = TensorDataType(0.95f), + TensorDataType alpha = TensorDataType(1.6732632423543772848170429916717), + TensorDataType scale = TensorDataType(1.0507009873554804934193349852946)) : + regularizer_layer(comm), m_keep_prob(keep_prob), m_mask(nullptr) { #ifdef LBANN_DETERMINISTIC @@ -56,13 +69,13 @@ class selu_dropout : public regularizer_layer { // Compute alpha' and the affine transform. m_alpha_prime = -scale*alpha; m_a = keep_prob + - m_alpha_prime*m_alpha_prime*keep_prob*(DataType(1) - keep_prob); - m_a = DataType(1) / std::sqrt(m_a); - m_b = -m_a * m_alpha_prime*(DataType(1) - keep_prob); + m_alpha_prime*m_alpha_prime*keep_prob*(El::TypeTraits::One() - keep_prob); + m_a = El::TypeTraits::One() / El::Sqrt(m_a); + m_b = -m_a * m_alpha_prime*(El::TypeTraits::One() - keep_prob); } selu_dropout(const selu_dropout& other) : - regularizer_layer(other), + regularizer_layer(other), m_alpha_prime(other.m_alpha_prime), m_a(other.m_a), m_b(other.m_b), @@ -72,7 +85,7 @@ class selu_dropout : public regularizer_layer { } selu_dropout& operator=(const selu_dropout& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_alpha_prime = other.m_alpha_prime; m_a = other.m_a; m_b = other.m_b; @@ -95,35 +108,35 @@ class selu_dropout : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } - void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + regularizer_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - regularizer_layer::setup_matrices(grid); + regularizer_layer::setup_matrices(grid); if (m_mask != nullptr) { delete m_mask; } - m_mask = get_activations().Copy(); + m_mask = this->get_activations().Copy(); } protected: /** Drop out units in forward propagation. */ void fp_compute() override { - if (this->m_model->get_execution_mode() != execution_mode::training || + if (this->m_model->get_execution_context().get_execution_mode() != execution_mode::training || m_keep_prob < 0.0f) { // Do nothing if dropout is disabled - El::Copy(get_prev_activations(), get_activations()); + El::Copy(this->get_prev_activations(), this->get_activations()); } else { - const auto *input_acts = &get_prev_activations(); + const auto *input_acts = &this->get_prev_activations(); const El::Int height = input_acts->Height(); const El::Int width = input_acts->Width(); const El::Int local_height = input_acts->LocalHeight(); const El::Int local_width = input_acts->LocalWidth(); const auto& local_input_acts = input_acts->LockedMatrix(); - Mat& local_output_acts = get_local_activations(); - Mat& local_mask = m_mask->Matrix(); + CPUMatrixType& local_output_acts = this->get_local_activations(); + CPUMatrixType& local_mask = m_mask->Matrix(); // Construct and apply mask and the affine transform. // TODO: Optimize. 
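For reference, the affine parameters computed in the constructor above follow the alpha-dropout formulation: alpha' = -scale * alpha, a = 1 / sqrt(q + alpha'^2 * q * (1 - q)), and b = -a * alpha' * (1 - q), where q is the keep probability. A minimal standalone sketch of that arithmetic (double precision, default q = 0.95; illustrative only, not part of this patch):

// Recompute the SELU-dropout affine transform for one activation.
#include <cmath>
#include <cstdio>

int main() {
  const double alpha = 1.6732632423543772848170429916717;
  const double scale = 1.0507009873554804934193349852946;
  const double q = 0.95;                       // keep probability
  const double alpha_prime = -scale * alpha;   // value assigned to dropped units
  double a = q + alpha_prime * alpha_prime * q * (1.0 - q);
  a = 1.0 / std::sqrt(a);
  const double b = -a * alpha_prime * (1.0 - q);
  // Forward pass for one input x with Bernoulli mask m (1 = keep, 0 = drop):
  const double x = 0.3, m = 1.0;
  const double y = a * (x * m + alpha_prime * (1.0 - m)) + b;
  std::printf("a = %f, b = %f, y = %f\n", a, b, y);
  return 0;
}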
@@ -132,7 +145,7 @@ class selu_dropout : public regularizer_layer { for (El::Int row = 0; row < local_height; ++row) { local_output_acts(row, col) = m_a * (local_input_acts(row, col)*local_mask(row, col) + - m_alpha_prime*(1 - local_mask(row, col))) + m_b; + m_alpha_prime*(El::TypeTraits::One() - local_mask(row, col))) + m_b; } } @@ -141,14 +154,14 @@ class selu_dropout : public regularizer_layer { /** Adjust gradients for dropout in backprop. */ void bp_compute() override { - if (this->m_model->get_execution_mode() != execution_mode::training + if (this->m_model->get_execution_context().get_execution_mode() != execution_mode::training || m_keep_prob < 0.0f) { - El::Copy(get_prev_error_signals(), get_error_signals()); + El::Copy(this->get_prev_error_signals(), this->get_error_signals()); } else { - const auto& local_prev_error_signal = get_local_prev_error_signals(); - Mat& local_error_signal = get_local_error_signals(); - Mat& local_mask = m_mask->Matrix(); + const auto& local_prev_error_signal = this->get_local_prev_error_signals(); + CPUMatrixType& local_error_signal = this->get_local_error_signals(); + CPUMatrixType& local_mask = m_mask->Matrix(); const El::Int local_height = local_prev_error_signal.Height(); const El::Int local_width = local_prev_error_signal.Width(); // Reweight with the affine scale factor and the dropout mask. @@ -164,17 +177,26 @@ class selu_dropout : public regularizer_layer { private: /** Alpha prime, the low-variance saturation point. */ - DataType m_alpha_prime; + TensorDataType m_alpha_prime; /** Affine scaling parameter to keep mean/variance at desired value. */ - DataType m_a; + TensorDataType m_a; /** Affine additive parameter to keep mean/variance at desired value. */ - DataType m_b; + TensorDataType m_b; /** Probability of keeping each unit. */ - float m_keep_prob; + TensorDataType m_keep_prob; /** Current dropout mask (a scaled Bernoulli random matrix). */ - AbsDistMat *m_mask; + AbsDistMatrixType *m_mask; }; +#ifndef LBANN_SELU_DROPOUT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class selu_dropout; \ + extern template class selu_dropout + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_SELU_DROPOUT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_REGULARIZER_SELU_DROPOUT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/CMakeLists.txt b/include/lbann/layers/transform/CMakeLists.txt index 645d1764511..874ed611b23 100644 --- a/include/lbann/layers/transform/CMakeLists.txt +++ b/include/lbann/layers/transform/CMakeLists.txt @@ -1,6 +1,6 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS - concatenation.hpp + concatenate.hpp pooling.hpp reshape.hpp slice.hpp diff --git a/include/lbann/layers/transform/bernoulli.hpp b/include/lbann/layers/transform/bernoulli.hpp index d3e827e6ee7..127cd581e52 100644 --- a/include/lbann/layers/transform/bernoulli.hpp +++ b/include/lbann/layers/transform/bernoulli.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_BERNOULLI_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -36,18 +37,20 @@ namespace lbann { * * During validation and testing, outputs are all zero. */ -template -class bernoulli_layer : public transform_layer { -private: - /** Probability of outputting 1. 
*/ - DataType m_prob; +template +class bernoulli_layer : public transform_layer { +public: + + using ProbabilityType = double; public: bernoulli_layer(lbann_comm *comm, std::vector dims, - DataType prob = DataType(0.5)) - : transform_layer(comm), m_prob(prob) { - set_output_dims(dims); + ProbabilityType prob = 0.5) + : transform_layer(comm), m_prob(prob) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } bernoulli_layer* copy() const override { return new bernoulli_layer(*this); } @@ -56,7 +59,7 @@ class bernoulli_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Probability", m_prob); return desc; } @@ -64,16 +67,34 @@ class bernoulli_layer : public transform_layer { protected: void fp_compute() override { - auto& output = get_activations(); - if (this->m_model->get_execution_mode() == execution_mode::training) { + auto& output = this->get_activations(); + if (this->m_model->get_execution_context().get_execution_mode() == execution_mode::training) { bernoulli_fill(output, output.Height(), output.Width(), m_prob); } else { El::Zero(output); } } +private: + + /** Probability of outputting 1. */ + ProbabilityType m_prob; + }; +LBANN_DEFINE_LAYER_BUILDER(bernoulli); + +#ifndef LBANN_BERNOULLI_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class bernoulli_layer; \ + extern template class bernoulli_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_BERNOULLI_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_BERNOULLI_HPP_INCLUDED diff --git a/include/lbann/layers/transform/categorical_random.hpp b/include/lbann/layers/transform/categorical_random.hpp index ac756dbeb5f..ef14c4ed4c1 100644 --- a/include/lbann/layers/transform/categorical_random.hpp +++ b/include/lbann/layers/transform/categorical_random.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_CATEGORICAL_RANDOM_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -40,16 +41,18 @@ namespace lbann { * * @todo Remove. 
*/ -template -class categorical_random_layer : public transform_layer { - +template +class categorical_random_layer : public transform_layer { + static_assert(Dev == El::Device::CPU, + "categorical random layer currently only supports CPU"); + static_assert(T_layout == data_layout::DATA_PARALLEL, + "categorical random layer currently only " + "supports DATA_PARALLEL"); public: categorical_random_layer(lbann_comm *comm) - : transform_layer(comm) { - static_assert(Dev == El::Device::CPU, - "categorical random layer currently only supports CPU"); - static_assert(T_layout == data_layout::DATA_PARALLEL, - "categorical random layer currently only supports DATA_PARALLEL"); + : transform_layer(comm) { } categorical_random_layer* copy() const override { return new categorical_random_layer(*this); } std::string get_type() const override { return "categorical random"; } @@ -61,19 +64,19 @@ class categorical_random_layer : public transform_layer { void fp_compute() override { // Input and output matrices - const auto& input = get_prev_activations(); + const auto& input = this->get_prev_activations(); const auto& local_input = input.LockedMatrix(); - auto& local_output = get_local_activations(); + auto& local_output = this->get_local_activations(); const auto& width = input.Width(); const auto& local_height = local_input.Height(); const auto& local_width = local_input.Width(); // Initialize output and random numbers - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); El::Zero(local_output); - StarVCMat rand_mat(input.Grid(), input.Root()); + StarVCMatDT rand_mat(input.Grid(), input.Root()); if (mode == execution_mode::training) { - uniform_fill(rand_mat, 1, width, DataType(0.5), DataType(0.5)); + uniform_fill(rand_mat, 1, width, TensorDataType(0.5), TensorDataType(0.5)); } // Process each mini-batch sample @@ -85,7 +88,7 @@ class categorical_random_layer : public transform_layer { if (mode == execution_mode::training) { // Choose first output with CDF above random number in (0,1) const auto& rand = rand_mat.GetLocal(0, col); - DataType cdf = DataType(0); + TensorDataType cdf = El::TypeTraits::Zero(); for (El::Int row = 0; row < local_height; ++row) { cdf += local_input(row, col); if (rand < cdf) { @@ -101,7 +104,7 @@ class categorical_random_layer : public transform_layer { } // Output a one-hot vector - local_output(index, col) = DataType(1); + local_output(index, col) = El::TypeTraits::One(); } @@ -109,6 +112,20 @@ class categorical_random_layer : public transform_layer { }; +LBANN_DEFINE_LAYER_BUILDER(categorical_random); + +#ifndef LBANN_CATEGORICAL_RANDOM_LAYER_INSTANTIATE + +#define PROTO(T) \ + extern template class categorical_random_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF + +#endif // LBANN_CATEGORICAL_RANDOM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_CATEGORICAL_RANDOM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/concatenate.hpp b/include/lbann/layers/transform/concatenate.hpp new file mode 100644 index 00000000000..2b3e5091436 --- /dev/null +++ b/include/lbann/layers/transform/concatenate.hpp @@ -0,0 +1,410 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) 
listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_TRANSFORM_CONCATENATE_HPP_INCLUDED +#define LBANN_LAYERS_TRANSFORM_CONCATENATE_HPP_INCLUDED + +#include "lbann/layers/data_type_layer.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" + +#include +#include + +namespace lbann { + +#ifdef LBANN_HAS_DISTCONV +template +class concatenate_distconv_adapter : public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + concatenate_distconv_adapter(Layer& layer): + data_type_distconv_adapter(layer) {} + virtual ~concatenate_distconv_adapter() = default; + dc::Shape get_activations_local_shape(int index=0) const override; + void fp_compute(); + void bp_compute(); +}; +#endif // LBANN_HAS_DISTCONV + +/** @brief Concatenate tensors along specified dimension. */ +template +class concatenate_layer : public data_type_layer { +public: + + concatenate_layer(lbann_comm *comm, size_t concat_dim); + concatenate_layer(const concatenate_layer& other) = default; + concatenate_layer& operator=(const concatenate_layer& other) = default; + + concatenate_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + + description get_description() const override; + +protected: + + void setup_pointers() override; + void setup_dims(DataReaderMetaData& dr_metadata) override; + + void fp_setup_outputs(El::Int mini_batch_size) override; + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override; + void fp_compute() override; + void bp_compute() override; + +private: + + /** @brief Tensor dimension to concatenate along. */ + size_t m_concat_dim; + +#ifdef LBANN_HAS_GPU + /** @brief Workspace buffer. + * + * Parameters for CUDA kernels are copied into this buffer and + * asynchronously transferred to GPU. + */ + std::vector m_workspace; + /** @brief CUDA event for workspace buffer. + * + * Makes sure asynchronous GPU memory transfers are completed + * before modifying workspace buffer. 
+ */ + cuda::event_wrapper m_workspace_event; +#endif // LBANN_HAS_GPU + + template + friend void fp_compute_impl(concatenate_layer&, size_t); + template + friend void bp_setup_gradient_wrt_inputs_impl(concatenate_layer&); + template + friend void bp_compute_impl(concatenate_layer&, size_t); + +#ifdef LBANN_HAS_DISTCONV + friend class concatenate_distconv_adapter; + protected: + bool is_distconv_supported() const override { + // Only supported for the channel dimension + return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL + && m_concat_dim == 0; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique< + concatenate_distconv_adapter>(*this); + } + concatenate_distconv_adapter& get_distconv_adapter() override; + const concatenate_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV +}; + +// ========================================================= +// Implementation +// ========================================================= + +template +concatenate_layer::concatenate_layer( + lbann_comm *comm, + size_t concat_dim) + : data_type_layer(comm), + m_concat_dim{concat_dim} { + this->m_expected_num_parent_layers = -1; // No limit on parents +} + +template +concatenate_layer* concatenate_layer::copy() const { + return new concatenate_layer(*this); +} + +template +std::string concatenate_layer::get_type() const { + return "concatenate"; +} + +template +data_layout concatenate_layer::get_data_layout() const { + return Layout; +} + +template +El::Device concatenate_layer::get_device_allocation() const { + return Device; +} + +template +description concatenate_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Concatenation dimension", m_concat_dim); + return desc; +} + +template +void concatenate_layer::setup_pointers() { + data_type_layer::setup_pointers(); + if (this->get_num_parents() < 1) { + LBANN_ERROR(get_type()," layer \"",this->get_name(),"\" ", + "has no parents"); + } +} + +template +void concatenate_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + + // Dimensions of first input tensor + auto output_dims = this->get_input_dims(0); + if (m_concat_dim >= output_dims.size()) { + std::ostringstream err; + err << get_type() << " layer \"" << this->get_name() << "\" " + << "is concatenating along dimension " << m_concat_dim << ", " + << "but it has a " << output_dims.size() << "-D input tensor " + << "(parent layer \"" << this->get_parent_layers()[0]->get_name() << "\" " + << "outputs with dimensions "; + for (size_t d=0; d0 ? " x " : "") << output_dims[d]; + } + err << ")"; + LBANN_ERROR(err.str()); + } + + // Dimensions of remaining input tensors + for (int j=1; jget_num_parents(); ++j) { + const auto& input_dims = this->get_input_dims(j); + if (input_dims.size() != output_dims.size() + || !std::equal(input_dims.begin(), + input_dims.begin() + m_concat_dim, + output_dims.begin()) + || !std::equal(input_dims.begin() + m_concat_dim + 1, + input_dims.end(), + output_dims.begin() + m_concat_dim + 1)) { + std::ostringstream err; + err << get_type() << " layer \"" << this->get_name() << "\" " + << "expects input tensors with dimensions "; + for (size_t d=0; d0 ? 
" x " : ""); + if (d == m_concat_dim) { err << "X"; } + else { err << output_dims[d]; } + } + err << ", but parent layer " + << "\"" << this->get_parent_layers()[j]->get_name() << "\" " + << "outputs with dimensions "; + for (size_t d=0; d < input_dims.size(); ++d) { + err << (d>0 ? " x " : "") << input_dims[d]; + } + LBANN_ERROR(err.str()); + } + output_dims[m_concat_dim] += input_dims[m_concat_dim]; + } + + // Model-parallel implementation only supports flat data + if (Layout == data_layout::MODEL_PARALLEL + && std::accumulate(&output_dims[0], &output_dims[m_concat_dim], 1, std::multiplies()) > 1) { + LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ", + "attempted to concatenate along dimension ",m_concat_dim,", ", + "but model-parallel concatenate layer " + "only supports flat data"); + } + + // Update output dimensions + this->set_output_dims(output_dims); + +} + +template +void concatenate_layer::fp_setup_outputs(El::Int mini_batch_size) { +#ifdef LBANN_HAS_DISTCONV + if (!this->keep_original_outputs(0)) return; +#endif // LBANN_HAS_DISTCONV + const auto& input0 = this->get_prev_activations(0); + auto& output = this->get_activations(); + output.Empty(false); + if (this->get_num_parents() == 1) { + El::LockedView(output, input0); + } + else { + output.AlignWith(input0); + output.Resize(this->get_output_size(), input0.Width()); + } +} + +template +void concatenate_layer::fp_compute() { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().fp_compute(); + return; + } +#endif + + // Just make a view if there is one input + if (this->get_num_parents() == 1) { + El::LockedView(this->get_activations(), this->get_prev_activations(0)); + return; + } + + // Perform concatenation + fp_compute_impl(*this, m_concat_dim); + +} + +template +void bp_setup_gradient_wrt_inputs_impl( + concatenate_layer& l) { +#ifdef LBANN_HAS_DISTCONV + if (l.distconv_enabled()) { + LBANN_ERROR("Model-parallel LBANN matrix not supported in distconv"); + } +#endif // LBANN_HAS_DISTCONV + + // Slice Elemental matrices + // Note: Assume each mini-batch sample is flat. 
+ const size_t num_inputs = l.get_num_parents(); + const auto& output_grad = l.get_prev_error_signals(); + size_t offset = 0; + for (size_t j=0; j +void bp_setup_gradient_wrt_inputs_impl( + concatenate_layer& l) { + + const size_t num_inputs = l.get_num_parents(); + const auto& output_grad = l.get_prev_error_signals(); + if (num_inputs == 1) { +#ifdef LBANN_HAS_DISTCONV + if (!l.keep_original_gradient_wrt_inputs(0)) return; +#endif + El::LockedView(l.get_error_signals(0), output_grad); + } + else { + for (size_t j=0; j +void concatenate_layer::bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + bp_setup_gradient_wrt_inputs_impl(*this); +} + +template +void concatenate_layer::bp_compute() { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().bp_compute(); + return; + } +#endif + + // Just make a view if there is one input + if (this->get_num_parents() == 1) { + El::LockedView(this->get_error_signals(0), this->get_prev_error_signals()); + return; + } + + // Perform slice + bp_compute_impl(*this, m_concat_dim); + +} + +#ifdef LBANN_HAS_DISTCONV +template +concatenate_distconv_adapter& +concatenate_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const concatenate_distconv_adapter& +concatenate_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +dc::Shape concatenate_distconv_adapter:: +get_activations_local_shape(int index) const { + assert_eq(index, 0); + auto shape = this->get_prev_activations().get_local_shape(); + shape[-2] = this->get_activations_shape()[-2]; + return shape; +} + +template +void concatenate_distconv_adapter:: +fp_compute() { + assert_always(this->layer().get_num_parents() == 2); + dc::tensor::Concatenate(this->get_activations(0), + this->get_prev_activations(0), + this->get_prev_activations(1), + El::GPUManager::Stream()); +} + +template +void concatenate_distconv_adapter:: +bp_compute() { + dc::tensor::Slice(this->get_error_signals(0), + this->get_error_signals(1), + this->get_prev_error_signals(0), + El::GPUManager::Stream()); +} +#endif // LBANN_HAS_DISTCONV + +LBANN_DEFINE_LAYER_BUILDER(concatenate); + +#ifndef LBANN_CONCATENATE_LAYER_INSTANTIATE + +#define PROTO_DEVICE(T, Device) \ + extern template class concatenate_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class concatenate_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE + +#endif // LBANN_CONCATENATE_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_TRANSFORM_CONCATENATE_HPP_INCLUDED diff --git a/include/lbann/layers/transform/concatenation.hpp b/include/lbann/layers/transform/concatenation.hpp deleted file mode 100644 index 5355787269f..00000000000 --- a/include/lbann/layers/transform/concatenation.hpp +++ /dev/null @@ -1,288 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. 
-// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_LAYER_CONCATENATION_HPP_INCLUDED -#define LBANN_LAYER_CONCATENATION_HPP_INCLUDED - -#include "lbann/layers/transform/transform.hpp" -#include "lbann/utils/exception.hpp" - -namespace lbann { - -/** @brief Concatenate tensors along specified dimension. */ -template -class concatenation_layer : public transform_layer { -public: - - concatenation_layer(lbann_comm *comm, El::Int concat_dim) - : transform_layer(comm), m_concat_dim(concat_dim) { - this->m_expected_num_parent_layers = -1; // No limit on parents - } - - concatenation_layer(const concatenation_layer& other) - : transform_layer(other), - m_concat_dim(other.m_concat_dim), - m_concat_points(other.m_concat_points) { - m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); - m_output_v.reset(other.m_output_v ? other.m_output_v->Copy() : nullptr); - } - - concatenation_layer& operator=(const concatenation_layer& other) { - transform_layer::operator=(other); - m_concat_dim = other.m_concat_dim; - m_concat_points = other.m_concat_points; - m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); - m_output_v.reset(other.m_output_v ? 
other.m_output_v->Copy() : nullptr); - } - - concatenation_layer* copy() const override { return new concatenation_layer(*this); } - std::string get_type() const override { return "concatenation"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - - description get_description() const override { - auto&& desc = transform_layer::get_description(); - desc.add("Concatenation dimension", m_concat_dim); - return desc; - } - -protected: - - void setup_pointers() override { - transform_layer::setup_pointers(); - if (get_num_parents() < 1) { - std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "has no parents"; - LBANN_ERROR(err.str()); - } - } - - void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& input = get_prev_activations(); - m_input_v.reset(input.Construct(input.Grid(), input.Root())); - m_output_v.reset(input.Construct(input.Grid(), input.Root())); - } - - void setup_dims() override { - transform_layer::setup_dims(); - - // Get concatenation points for first parent layer - auto output_dims = get_input_dims(0); - if (m_concat_dim < 0 - || m_concat_dim >= (El::Int) output_dims.size()) { - std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "has " << output_dims.size() << " dimensions, " - << "but attempted to concatenate along " - << "dimension " << m_concat_dim; - LBANN_ERROR(err.str()); - } - m_concat_points.clear(); - m_concat_points.push_back(0); - m_concat_points.push_back(output_dims[m_concat_dim]); - - // Get concatenation points for remaining parent layers - for (int i = 1; i < get_num_parents(); ++i) { - const auto& input_dims = get_input_dims(i); - if (input_dims.size() != output_dims.size() - || !std::equal(input_dims.begin(), - input_dims.begin() + m_concat_dim, - output_dims.begin()) - || !std::equal(input_dims.begin() + m_concat_dim + 1, - input_dims.end(), - output_dims.begin() + m_concat_dim + 1)) { - std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "expects input tensors with dimensions "; - for (size_t j = 0; j < output_dims.size(); ++j) { - err << (j > 0 ? " x " : ""); - if ((int) j == m_concat_dim) { - err << "X"; - } else { - err << output_dims[j]; - } - } - err << ", but parent layer " - << "\"" << m_parent_layers[i]->get_name() << "\" " - << "outputs with dimensions "; - for (size_t j = 0; j < input_dims.size(); ++j) { - err << (j > 0 ? " x " : "") << input_dims[j]; - } - LBANN_ERROR(err.str()); - } - output_dims[m_concat_dim] += input_dims[m_concat_dim]; - m_concat_points.push_back(output_dims[m_concat_dim]); - } - - // Update output dimensions - set_output_dims(output_dims); - - } - - void fp_setup_outputs(El::Int mini_batch_size) override { - const auto& num_inputs = get_num_parents(); - const auto& output_dims = get_output_dims(); - - // Initialize output tensor - auto& output = get_activations(); - output.Empty(false); - if (num_inputs > 1) { - output.AlignWith(get_prev_activations()); - output.Resize(get_output_size(), mini_batch_size); - } else { - El::LockedView(output, get_prev_activations()); - return; - } - - // Divide output tensor into unit slices along concat dimension - // Note: Each unit slice is divided into contiguous "unit blocks" - const auto& output_num_unit_slices = output_dims[m_concat_dim]; - const auto& blocks_per_slice - = (m_concat_dim > 0 ? 
- std::accumulate(&output_dims[0], &output_dims[m_concat_dim], - 1, std::multiplies()) : - 1); - const auto& unit_block_size - = std::accumulate(output_dims.begin() + m_concat_dim + 1, - output_dims.end(), - 1, std::multiplies()); - const auto& output_block_stride = (output_num_unit_slices - * unit_block_size); - - // Populate slices of output tensor with input tensors - for (int i = 0; i < num_inputs; ++i) { - const auto& input_dims = get_input_dims(i); - auto& input = get_prev_activations(i); - - // Divide input tensor into unit slices - const auto& input_num_unit_slices = input_dims[m_concat_dim]; - - // Merge unit slices - const auto& block_size = input_num_unit_slices * unit_block_size; - const auto& output_block_offset = m_concat_points[i] * unit_block_size; - - // Populate output tensor one block at a time - for (int block = 0; block < blocks_per_slice; ++block) { - const auto& input_offset = block * block_size; - const auto& output_offset = (output_block_offset - + block * output_block_stride); - El::LockedView(*m_input_v, input, - El::IR(input_offset, input_offset + block_size), - El::ALL); - El::View(*m_output_v, output, - El::IR(output_offset, output_offset + block_size), - El::ALL); - El::Copy(*m_input_v, *m_output_v); - } - - } - - } - - void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - const auto& num_inputs = get_num_parents(); - const auto& output_dims = get_output_dims(); - - // Divide output tensor into unit slices along concat dimension - // Note: Each unit slice is divided into contiguous "unit blocks" - const auto& output_num_unit_slices = output_dims[m_concat_dim]; - const auto& blocks_per_slice - = (m_concat_dim > 0 ? - std::accumulate(&output_dims[0], &output_dims[m_concat_dim], - 1, std::multiplies()) : - 1); - const auto& unit_block_size - = std::accumulate(output_dims.begin() + m_concat_dim + 1, - output_dims.end(), - 1, std::multiplies()); - const auto& output_block_stride = (output_num_unit_slices - * unit_block_size); - - // Populate gradient w.r.t. input tensors - const auto& gradient_wrt_output = get_prev_error_signals(); - for (int i = 0; i < num_inputs; ++i) { - const auto& input_dims = get_input_dims(i); - const auto& input_size = get_input_size(i); - auto& gradient_wrt_input = get_error_signals(i); - - // Divide input tensor into unit slices - const auto& input_num_unit_slices = input_dims[m_concat_dim]; - - // Merge unit slices and get first contiguous output block - const auto& block_size = input_num_unit_slices * unit_block_size; - const auto& output_block_offset = m_concat_points[i] * unit_block_size; - El::LockedView(*m_output_v, gradient_wrt_output, - El::IR(output_block_offset, - output_block_offset + block_size), - El::ALL); - - // Populate gradient w.r.t. 
input tensor one block at a time - // Note: If there is only one block, the tensor can be a view - if (blocks_per_slice > 1) { - gradient_wrt_input.AlignWith(*m_output_v); - gradient_wrt_input.Resize(input_size, mini_batch_size); - for (int block = 0; block < blocks_per_slice; ++block) { - const auto& input_offset = block * block_size; - const auto& output_offset = (output_block_offset - + block * output_block_stride); - El::LockedView(*m_output_v, gradient_wrt_output, - El::IR(output_offset, output_offset + block_size), - El::ALL); - El::View(*m_input_v, gradient_wrt_input, - El::IR(input_offset, input_offset + block_size), - El::ALL); - El::Copy(*m_output_v, *m_input_v); - } - } else { - El::LockedView(gradient_wrt_input, *m_output_v); - } - - } - - } - - void fp_compute() override {} - void bp_compute() override {} - -private: - - /** Tensor dimension to concatenation. */ - El::Int m_concat_dim; - /** Concatenation points for each child layer. */ - std::vector m_concat_points; - - /** View into input tensor. */ - std::unique_ptr m_input_v; - /** View into output tensor. */ - std::unique_ptr m_output_v; - -}; - -} // namespace lbann - -#endif // LBANN_LAYER_CONCATENATION_HPP_INCLUDED diff --git a/include/lbann/layers/transform/constant.hpp b/include/lbann/layers/transform/constant.hpp index f4390884a56..b13737e424d 100644 --- a/include/lbann/layers/transform/constant.hpp +++ b/include/lbann/layers/transform/constant.hpp @@ -32,15 +32,17 @@ namespace lbann { /** @brief Constant output. */ -template -class constant_layer : public transform_layer { +template +class constant_layer : public transform_layer { public: constant_layer(lbann_comm *comm, - DataType value, + TensorDataType value, std::vector dims) - : transform_layer(comm), m_value(value) { - set_output_dims(dims); + : transform_layer(comm), m_value(value) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } @@ -50,7 +52,7 @@ class constant_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Value", m_value); return desc; } @@ -59,19 +61,30 @@ class constant_layer : public transform_layer { void fp_compute() override { if (m_value == EvalType(0)) { - El::Zero(get_activations()); + El::Zero(this->get_activations()); } else { - El::Fill(get_activations(), m_value); + El::Fill(this->get_activations(), m_value); } } private: /** Constant value. */ - DataType m_value; + TensorDataType m_value; }; +LBANN_DEFINE_LAYER_BUILDER(constant); + +#ifndef LBANN_CONSTANT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class constant_layer; \ + extern template class constant_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_CONSTANT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_CONSTANT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/crop.hpp b/include/lbann/layers/transform/crop.hpp index f0b37b293d3..e77ab06a18f 100644 --- a/include/lbann/layers/transform/crop.hpp +++ b/include/lbann/layers/transform/crop.hpp @@ -40,21 +40,37 @@ namespace lbann { * to the red-top-left corner and (1,1,1) to the blue-bottom-right * corner. The crop size is determined at setup. 
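 * (Worked example from the offset computation below: with a 3 x 32 x 32
 * input, output dimensions 3 x 24 x 24, and crop position (0.5, 0.5, 0.5),
 * each dimension has num_offsets = in - out + 1 candidate offsets, so the
 * selected window starts at offset (0, 4, 4), i.e. the centered 24 x 24
 * patch of every channel.)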
*/ -template -class crop_layer : public transform_layer { +template +class crop_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "crop layer only supports DATA_PARALLEL"); +#ifdef LBANN_HAS_GPU_FP16 + using CompareType = typename std::conditional::value, float, TensorDataType>::type; +#else + using CompareType = TensorDataType; +#endif +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: crop_layer(lbann_comm *comm, std::vector dims) - : transform_layer(comm) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "crop layer only supports DATA_PARALLEL"); - set_output_dims(dims); + : transform_layer(comm) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 2; } crop_layer(const crop_layer& other) - : transform_layer(other), + : transform_layer(other), m_input_v(other.m_input_v ? other.m_input_v->Copy() : nullptr), m_output_v(other.m_output_v ? @@ -62,7 +78,7 @@ class crop_layer : public transform_layer { m_crop_pos_v(other.m_crop_pos_v ? other.m_crop_pos_v->Copy() : nullptr){} crop_layer& operator=(const crop_layer& other) { - transform_layer::operator=(other); + transform_layer::operator=(other); m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); m_output_v.reset(other.m_output_v ? @@ -78,14 +94,14 @@ class crop_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& input = get_prev_activations(); + transform_layer::setup_matrices(grid); + const auto& input = this->get_prev_activations(); const auto& dist = input.DistData(); m_input_v.reset(input.Construct(input.Grid(), input.Root())); m_output_v.reset(input.Construct(input.Grid(), input.Root())); /// @todo Setup the input tensor with this data distribution - m_crop_pos_v.reset(AbsDistMat::Instantiate(*dist.grid, + m_crop_pos_v.reset(AbsDistMatrixType::Instantiate(*dist.grid, dist.root, El::STAR, dist.rowDist, @@ -95,30 +111,30 @@ class crop_layer : public transform_layer { } - void setup_dims() override { - transform_layer::setup_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); std::stringstream err; // Make sure input tensors have valid dimensions - const auto& input_dims = get_input_dims(0); - const auto& loc_dims = get_input_dims(1); - const auto& output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(0); + const auto& loc_dims = this->get_input_dims(1); + const auto& output_dims = this->get_output_dims(); if (input_dims.size() != output_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects a crop input tensor with " << output_dims.size() << " dimensions, " << "but parent layer " - << "\"" << m_parent_layers[0]->get_name() << "\" " + << "\"" << this->get_parent_layers()[0]->get_name() << "\" " << "outputs a tensor with " << input_dims.size() << " dimensions"; LBANN_ERROR(err.str()); } if (loc_dims.size() != 1 || loc_dims[0] != (int) input_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects a 1D crop position tensor with " << output_dims.size() << " entries, " << "but parent layer " - << "\"" << 
m_parent_layers[1]->get_name() << "\" " + << "\"" << this->get_parent_layers()[1]->get_name() << "\" " << "outputs a tensor with dimensions "; for (size_t i = 0; i < loc_dims.size(); ++i) { err << (i > 0 ? " x " : "") << loc_dims[i]; @@ -131,14 +147,14 @@ class crop_layer : public transform_layer { protected: void fp_compute() override { - switch (get_input_dims().size()) { + switch (this->get_input_dims().size()) { case 3: fp_compute_3d(); break; default: fp_compute_nd(); } } void bp_compute() override { - switch (get_input_dims().size()) { + switch (this->get_input_dims().size()) { case 3: bp_compute_3d(); break; default: bp_compute_nd(); } @@ -146,22 +162,22 @@ class crop_layer : public transform_layer { private: /** View into input tensor. */ - std::unique_ptr m_input_v; + std::unique_ptr m_input_v; /** View into output tensor. */ - std::unique_ptr m_output_v; + std::unique_ptr m_output_v; /** View into crop positions. */ - std::unique_ptr m_crop_pos_v; + std::unique_ptr m_crop_pos_v; /** Forward prop implementation for n-dimensional tensors. */ void fp_compute_nd() { // Input and output tensors - const auto& input = get_prev_activations(0); - auto& output = get_activations(); + const auto& input = this->get_prev_activations(0); + auto& output = this->get_activations(); // Tensor dimensions - const auto& input_dims = get_input_dims(0); - const auto& output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(0); + const auto& output_dims = this->get_output_dims(); const El::Int num_dims = output_dims.size(); const auto& local_width = input.LocalWidth(); const auto& region_size = output_dims.back(); @@ -169,7 +185,7 @@ class crop_layer : public transform_layer { // Get crop position m_crop_pos_v->Empty(false); m_crop_pos_v->AlignWith(input); - const auto& input1 = get_prev_activations(1); + const auto& input1 = this->get_prev_activations(1); if (m_crop_pos_v->DistData() == input1.DistData()) { El::LockedView(*m_crop_pos_v, input1); } else { @@ -186,7 +202,7 @@ class crop_layer : public transform_layer { std::vector crop_offsets; for (El::Int d = 0; d < num_dims; ++d) { const auto& pos = local_crop_pos(d, local_col); - if (pos < DataType(0) || pos > DataType(1)) { + if (CompareType(pos) < CompareType(0.0) || CompareType(pos) > CompareType(1.0)) { std::stringstream err; err << "crop position not in range [0,1] (pos=("; for (El::Int i = 0; i < local_crop_pos.Height(); ++i) { @@ -196,7 +212,7 @@ class crop_layer : public transform_layer { LBANN_ERROR(err.str()); } const El::Int num_offsets = input_dims[d] - output_dims[d] + 1; - crop_offsets.push_back(std::min(El::Int(pos * num_offsets), + crop_offsets.push_back(std::min(El::Int(static_cast(pos) * num_offsets), num_offsets - 1)); } @@ -241,17 +257,17 @@ class crop_layer : public transform_layer { void bp_compute_nd() { // Clear error signals - El::Zero(get_error_signals(0)); - El::Zero(get_error_signals(1)); + El::Zero(this->get_error_signals(0)); + El::Zero(this->get_error_signals(1)); // Input and gradient tensors - const auto& gradient_wrt_output = get_prev_error_signals(); - auto& gradient_wrt_input = get_error_signals(0); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(0); const auto& local_crop_pos = m_crop_pos_v->LockedMatrix(); // Tensor dimensions - const auto& input_dims = get_input_dims(0); - const auto& output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(0); + const auto& output_dims = 
this->get_output_dims(); const El::Int num_dims = output_dims.size(); const auto& local_width = gradient_wrt_input.LocalWidth(); const auto& region_size = output_dims.back(); @@ -265,7 +281,7 @@ class crop_layer : public transform_layer { std::vector crop_offsets; for (El::Int d = 0; d < num_dims; ++d) { const auto& pos = local_crop_pos(d, local_col); - if (pos < DataType(0) || pos > DataType(1)) { + if (CompareType(pos) < CompareType(0.0) || CompareType(pos) > CompareType(1.0)) { std::stringstream err; err << "crop position not in range [0,1] (pos=("; for (El::Int i = 0; i < local_crop_pos.Height(); ++i) { @@ -275,7 +291,7 @@ class crop_layer : public transform_layer { LBANN_ERROR(err.str()); } const El::Int num_offsets = input_dims[d] - output_dims[d] + 1; - crop_offsets.push_back(std::min(El::Int(pos * num_offsets), + crop_offsets.push_back(std::min(El::Int(static_cast(pos) * num_offsets), num_offsets - 1)); } @@ -327,6 +343,16 @@ class crop_layer : public transform_layer { }; +LBANN_DEFINE_LAYER_BUILDER(crop); + +#ifndef LBANN_CROP_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class crop_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_CROP_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_CROP_HPP_INCLUDED diff --git a/include/lbann/layers/transform/discrete_random.hpp b/include/lbann/layers/transform/discrete_random.hpp index c668971726f..18d30846e25 100644 --- a/include/lbann/layers/transform/discrete_random.hpp +++ b/include/lbann/layers/transform/discrete_random.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_DISCRETE_RANDOM_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -39,8 +40,14 @@ namespace lbann { * * @todo Remove. */ -template -class discrete_random_layer : public transform_layer { +template +class discrete_random_layer : public transform_layer { + static_assert(Dev == El::Device::CPU, + "discrete random layer currently only supports CPU"); + static_assert(T_layout == data_layout::DATA_PARALLEL, + "discrete random layer currently only supports DATA_PARALLEL"); private: /** Values in discrete distribution. 
*/ @@ -50,13 +57,9 @@ class discrete_random_layer : public transform_layer { discrete_random_layer(lbann_comm *comm, std::vector values, std::vector dims) - : transform_layer(comm), + : transform_layer(comm), m_values(values) { - static_assert(Dev == El::Device::CPU, - "discrete random layer currently only supports CPU"); - static_assert(T_layout == data_layout::DATA_PARALLEL, - "discrete random layer currently only supports DATA_PARALLEL"); - set_output_dims(dims); + this->set_output_dims(dims); } discrete_random_layer* copy() const override { return new discrete_random_layer(*this); } std::string get_type() const override { return "discrete random"; } @@ -65,9 +68,9 @@ class discrete_random_layer : public transform_layer { protected: - void setup_dims() override { - transform_layer::setup_dims(); - if (get_input_size() != (int) m_values.size()) { + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + if (this->get_input_size() != (int) m_values.size()) { LBANN_ERROR("input tensor dimensions don't match number of " "values in discrete distribution"); } @@ -76,9 +79,9 @@ class discrete_random_layer : public transform_layer { void fp_compute() override { // Input and output matrices - const auto& input = get_prev_activations(); + const auto& input = this->get_prev_activations(); const auto& local_input = input.LockedMatrix(); - auto& output = get_activations(); + auto& output = this->get_activations(); auto& local_output = output.Matrix(); const int num_values = m_values.size(); const auto& num_outputs = local_output.Height(); @@ -86,9 +89,9 @@ class discrete_random_layer : public transform_layer { const auto& local_width = input.LocalWidth(); // Initialize random numbers - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode == execution_mode::training) { - uniform_fill(output, 1, width, DataType(0.5), DataType(0.5)); + uniform_fill(output, 1, width, TensorDataType(0.5), TensorDataType(0.5)); } // Process each mini-batch sample @@ -119,6 +122,16 @@ class discrete_random_layer : public transform_layer { }; +#ifndef LBANN_DISCRETE_RANDOM_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class discrete_random_layer< \ + T, data_layout::DATA_PARALLEL, El::Device::CPU> + +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#endif // LBANN_DISCRETE_RANDOM_LAYER_INSTANTIATE } // namespace lbann #endif // LBANN_LAYER_DISCRETE_RANDOM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/dummy.hpp b/include/lbann/layers/transform/dummy.hpp index ec451fbe08e..4a3371752b9 100644 --- a/include/lbann/layers/transform/dummy.hpp +++ b/include/lbann/layers/transform/dummy.hpp @@ -36,10 +36,12 @@ namespace lbann { * Does no computation and is primarily intended as a placeholder for * unused layer outputs. 
*/ -template -class dummy_layer : public transform_layer { +template +class dummy_layer : public transform_layer { public: - dummy_layer(lbann_comm *comm) : transform_layer(comm) { + dummy_layer(lbann_comm *comm) : transform_layer(comm) { this->m_expected_num_child_layers = 0; } dummy_layer* copy() const override { return new dummy_layer(*this); } @@ -50,6 +52,17 @@ class dummy_layer : public transform_layer { void fp_compute() override {} }; +LBANN_DEFINE_LAYER_BUILDER(dummy); + +#ifndef LBANN_DUMMY_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class dummy_layer; \ + extern template class dummy_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_DUMMY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_DUMMY_HPP_INCLUDED diff --git a/include/lbann/layers/transform/evaluation.hpp b/include/lbann/layers/transform/evaluation.hpp index 014ff9e3849..aaf5d3ea209 100644 --- a/include/lbann/layers/transform/evaluation.hpp +++ b/include/lbann/layers/transform/evaluation.hpp @@ -32,7 +32,16 @@ namespace lbann { /** @brief Interface with objective function and metrics. */ -class abstract_evaluation_layer : public transform_layer { +template +class abstract_evaluation_layer : public transform_layer { +public: +#ifdef LBANN_DETERMINISTIC + using EvalDataType = EvalType; +#else + using EvalDataType = TensorDataType; +#endif + using CPUMatType = El::Matrix; + public: /** Get scaling factor. */ @@ -51,8 +60,8 @@ class abstract_evaluation_layer : public transform_layer { protected: abstract_evaluation_layer(lbann_comm *comm); - void setup_dims() override; - void setup_data() override; + void setup_dims(DataReaderMetaData& dr_metadata) override; + void setup_data(size_t max_mini_batch_size) override; void fp_compute() override; void bp_compute() override; @@ -63,7 +72,7 @@ class abstract_evaluation_layer : public transform_layer { /** Evaluated value. * The value may be stored in pinned memory. */ - CPUMat m_value; + CPUMatType m_value; /** Non-blocking allreduce request. */ Al::request m_allreduce_req; #ifdef LBANN_HAS_GPU @@ -77,16 +86,39 @@ class abstract_evaluation_layer : public transform_layer { * Computes the average value across a mini-batch. If the input * tensor has multiple neurons, their values are added together. 
*/ -template -class evaluation_layer : public abstract_evaluation_layer { +template +class evaluation_layer : public abstract_evaluation_layer { public: - evaluation_layer(lbann_comm *comm) : abstract_evaluation_layer(comm) {} + evaluation_layer(lbann_comm *comm) : abstract_evaluation_layer(comm) {} evaluation_layer* copy() const override { return new evaluation_layer(*this); } std::string get_type() const override { return "evaluation"; } data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } }; +LBANN_DEFINE_LAYER_BUILDER(evaluation); + +#ifndef LBANN_EVALUATION_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class abstract_evaluation_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF + +#define PROTO_DEVICE(T, Device) \ + extern template class evaluation_layer; \ + extern template class evaluation_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_EVALUATION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_EVALUATION_HPP_INCLUDED diff --git a/include/lbann/layers/transform/gaussian.hpp b/include/lbann/layers/transform/gaussian.hpp index 7ab43afc3a7..9123c2e1172 100644 --- a/include/lbann/layers/transform/gaussian.hpp +++ b/include/lbann/layers/transform/gaussian.hpp @@ -28,30 +28,38 @@ #define LBANN_LAYER_GAUSSIAN_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { -/** @brief Random values with Gaussian distribution. - * - * During validation and testing, outputs are all equal to the - * distribution mean. - */ -template -class gaussian_layer : public transform_layer { +/** @brief Random values from Gaussian/normal distribution. */ +template +class gaussian_layer : public transform_layer { private: - /** Gaussian distribution mean. */ - DataType m_mean; - /** Gaussian distribution standard deviation. */ - DataType m_stdev; + /** @brief Gaussian distribution mean. */ + TensorDataType m_mean; + /** @brief Gaussian distribution standard deviation. */ + TensorDataType m_stdev; + /** @brief Whether to have deterministic output when not training. + * + * Applies to execution modes other than training, e.g. validation + * and inference. If true, outputs are all equal to the + * distribution mean when not training. 
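 * For instance, with mean 0, standard deviation 1, and training_only
 * enabled, the layer emits N(0,1) samples during training and an all-zero
 * tensor during validation and testing; with training_only disabled it
 * draws fresh samples in every execution mode.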
+ */ + bool m_training_only; public: gaussian_layer(lbann_comm *comm, const std::vector& dims, - DataType mean = DataType(0), - DataType stdev = DataType(1)) - : transform_layer(comm), m_mean(mean), m_stdev(stdev) { - set_output_dims(dims); + TensorDataType mean = El::TypeTraits::Zero(), + TensorDataType stdev = El::TypeTraits::One(), + bool training_only = false) + : transform_layer(comm), + m_mean(mean), m_stdev(stdev), m_training_only(training_only) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } gaussian_layer* copy() const override { return new gaussian_layer(*this); } @@ -60,25 +68,37 @@ class gaussian_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Mean", m_mean); desc.add("Standard deviation", m_stdev); + desc.add("Training only", m_training_only); return desc; } protected: void fp_compute() override { - auto& output = get_activations(); - if (this->m_model->get_execution_mode() == execution_mode::training) { - gaussian_fill(output, output.Height(), output.Width(), m_mean, m_stdev); - } else { + auto& output = this->get_activations(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); + if (m_training_only && (mode != execution_mode::training)) { El::Fill(output, m_mean); } + else { + gaussian_fill(output, output.Height(), output.Width(), m_mean, m_stdev); + } } }; +#ifndef LBANN_GAUSSIAN_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class gaussian_layer; \ + extern template class gaussian_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_GAUSSIAN_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_GAUSSIAN_HPP_INCLUDED diff --git a/include/lbann/layers/transform/hadamard.hpp b/include/lbann/layers/transform/hadamard.hpp index 04426334b91..7987ef0b72f 100644 --- a/include/lbann/layers/transform/hadamard.hpp +++ b/include/lbann/layers/transform/hadamard.hpp @@ -34,11 +34,13 @@ namespace lbann { /** @brief Entry-wise tensor product. 
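 * (With two parents a and b of matching dimensions the output is the
 * entry-wise product a(i) * b(i); with more parents the products are
 * accumulated pairwise in fp_compute, and in backprop each parent's
 * gradient is the output gradient multiplied entry-wise by the other
 * parents' activations.)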
*/ -template -class hadamard_layer : public transform_layer { +template +class hadamard_layer : public transform_layer { public: - hadamard_layer(lbann_comm *comm) : transform_layer(comm) { + hadamard_layer(lbann_comm *comm) : transform_layer(comm) { this->m_expected_num_parent_layers = -1; // No limit on parents } @@ -50,29 +52,29 @@ class hadamard_layer : public transform_layer { protected: void setup_pointers() override { - transform_layer::setup_pointers(); - if (get_num_parents() < 1) { + transform_layer::setup_pointers(); + if (this->get_num_parents() < 1) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has no parent layers"; LBANN_ERROR(err.str()); } } - void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); // Check that input dimensions match - const auto& output_dims = get_output_dims(); - for (int i = 0; i < get_num_parents(); ++i) { - if (get_input_dims(i) != output_dims) { - const auto& parents = get_parent_layers(); + const auto& output_dims = this->get_output_dims(); + for (int i = 0; i < this->get_num_parents(); ++i) { + if (this->get_input_dims(i) != output_dims) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with incompatible dimensions ("; - for (int j = 0; j < get_num_parents(); ++j) { - const auto& dims = get_input_dims(j); + for (int j = 0; j < this->get_num_parents(); ++j) { + const auto& dims = this->get_input_dims(j); err << (j > 0 ? 
", " : "") << "layer \"" << parents[j]->get_name() << "\" outputs "; for (size_t k = 0; k < dims.size(); ++k) { @@ -87,35 +89,35 @@ class hadamard_layer : public transform_layer { } void fp_compute() override { - auto& output = get_activations(); - switch (get_num_parents()) { - case 0: El::Fill(output, DataType(1)); break; - case 1: El::LockedView(output, get_prev_activations()); break; + auto& output = this->get_activations(); + switch (this->get_num_parents()) { + case 0: El::Fill(output, El::TypeTraits::One()); break; + case 1: El::LockedView(output, this->get_prev_activations()); break; default: - El::Hadamard(get_prev_activations(0), - get_prev_activations(1), + El::Hadamard(this->get_prev_activations(0), + this->get_prev_activations(1), output); - for (int i = 2; i < get_num_parents(); ++i) { - El::Hadamard(get_prev_activations(i), output, output); + for (int i = 2; i < this->get_num_parents(); ++i) { + El::Hadamard(this->get_prev_activations(i), output, output); } } } void bp_compute() override { - const int num_parents = get_num_parents(); - const auto& gradient_wrt_output = get_prev_error_signals(); + const int num_parents = this->get_num_parents(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); switch (num_parents) { case 0: break; case 1: - El::LockedView(get_error_signals(), gradient_wrt_output); + El::LockedView(this->get_error_signals(), gradient_wrt_output); break; default: for (int i = 0; i < num_parents; ++i) { - auto& gradient_wrt_input = get_error_signals(i); + auto& gradient_wrt_input = this->get_error_signals(i); El::Copy(gradient_wrt_output, gradient_wrt_input); for (int j = 0; j < num_parents; ++j) { if (i != j) { - El::Hadamard(get_prev_activations(j), + El::Hadamard(this->get_prev_activations(j), gradient_wrt_input, gradient_wrt_input); } @@ -126,6 +128,17 @@ class hadamard_layer : public transform_layer { }; +LBANN_DEFINE_LAYER_BUILDER(hadamard); + +#ifndef LBANN_HADAMARD_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class hadamard_layer; \ + extern template class hadamard_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_HADAMARD_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_HADAMARD_HPP_INCLUDED diff --git a/include/lbann/layers/transform/in_top_k.hpp b/include/lbann/layers/transform/in_top_k.hpp index 85abe8caba4..26852889c3c 100644 --- a/include/lbann/layers/transform/in_top_k.hpp +++ b/include/lbann/layers/transform/in_top_k.hpp @@ -38,12 +38,14 @@ namespace lbann { * one and the rest to zero. Ties are broken in favor of entries with * smaller indices. 
*/ -template -class in_top_k_layer : public transform_layer { +template +class in_top_k_layer : public transform_layer { public: in_top_k_layer(lbann_comm *comm, El::Int k) - : transform_layer(comm), m_k(k) { + : transform_layer(comm), m_k(k) { if (m_k < 0) { std::stringstream err; err << "invalid parameter for top-k search (k=" << m_k << ")"; @@ -57,16 +59,16 @@ class in_top_k_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("k", m_k); return desc; } protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; @@ -78,6 +80,15 @@ class in_top_k_layer : public transform_layer { }; +#ifndef LBANN_IN_TOP_K_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class in_top_k_layer; \ + extern template class in_top_k_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_IN_TOP_K_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_IN_TOP_K_HPP_INCLUDED diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp index abf6689aa82..35db88a633c 100644 --- a/include/lbann/layers/transform/pooling.hpp +++ b/include/lbann/layers/transform/pooling.hpp @@ -33,19 +33,42 @@ #include "lbann/utils/cudnn.hpp" #include "lbann/utils/exception.hpp" #include "lbann/utils/im2col.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class pooling_distconv_adapter : public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + pooling_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~pooling_distconv_adapter() = default; + void setup_distributions(tensor_overlap_constraints &constraints) override; + dc::Shape get_activations_local_shape(int index=0) const override; + void setup_layer(size_t workspace_capacity) override; + void fp_compute(); + void bp_compute(); + std::unique_ptr> m_pooling; +}; +#endif // LBANN_HAS_DISTCONV + // Forward declaration -template +template class unpooling_layer; -template -class pooling_layer : public transform_layer { +template +class pooling_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "pooling only supports DATA_PARALLEL"); private: /** Pooling mode. */ - const pool_mode m_pool_mode; + pool_mode m_pool_mode; /** Pooling window dimensions. */ std::vector m_pool_dims; @@ -67,10 +90,10 @@ class pooling_layer : public transform_layer { /** Pooling descriptor. */ cudnnPoolingDescriptor_t m_pooling_cudnn_desc; /** Tensor cuDNN descriptors. 
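A standalone illustration of the selection rule described in the in_top_k comment above: entries among the k largest map to one, everything else to zero, and ties go to the smaller index. This is a CPU sketch with an invented helper name, not the layer's actual fp_compute.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

std::vector<float> in_top_k_sketch(const std::vector<float>& x, std::size_t k) {
  std::vector<std::size_t> idx(x.size());
  std::iota(idx.begin(), idx.end(), 0);
  // A stable sort keeps equal values in their original order, so ties are
  // broken in favor of smaller indices, as the layer documents.
  std::stable_sort(idx.begin(), idx.end(),
                   [&x](std::size_t a, std::size_t b) { return x[a] > x[b]; });
  std::vector<float> y(x.size(), 0.0f);
  for (std::size_t i = 0; i < std::min(k, idx.size()); ++i) { y[idx[i]] = 1.0f; }
  return y;
}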
*/ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN - friend class unpooling_layer; + friend class unpooling_layer; public: @@ -93,7 +116,7 @@ class pooling_layer : public transform_layer { std::vector pads, std::vector strides, pool_mode mode) - : transform_layer(comm), + : transform_layer(comm), m_pool_mode(mode), m_pool_dims(pool_dims), m_pads(pads), @@ -103,9 +126,6 @@ class pooling_layer : public transform_layer { m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "pooling only supports DATA_PARALLEL"); - // Initialize input dimensions and pooling parameters m_pool_size = std::accumulate(m_pool_dims.begin(), m_pool_dims.end(), @@ -115,7 +135,7 @@ class pooling_layer : public transform_layer { } pooling_layer(const pooling_layer& other) - : transform_layer(other), + : transform_layer(other), m_pool_mode(other.m_pool_mode), m_pool_dims(other.m_pool_dims), m_pool_size(other.m_pool_size), @@ -134,7 +154,7 @@ class pooling_layer : public transform_layer { } pooling_layer& operator=(const pooling_layer& other){ - transform_layer::operator=(other); + transform_layer::operator=(other); m_pool_mode = other.m_pool_mode; m_pool_dims = other.m_pool_dims; m_pool_size = other.m_pool_size; @@ -163,7 +183,7 @@ class pooling_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::stringstream ss; // Pool mode @@ -210,21 +230,21 @@ class pooling_layer : public transform_layer { protected: - void setup_dims() override { - transform_layer::setup_dims(); - const auto& input_dims = get_input_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + const auto& input_dims = this->get_input_dims(); auto output_dims = input_dims; for(size_t i = 0; i < output_dims.size() - 1; ++i) { const int effective_dim = (input_dims[i+1] + 2 * m_pads[i] - m_pool_dims[i] + 1); output_dims[i+1] = (effective_dim + m_strides[i] - 1) / m_strides[i]; } - set_output_dims(output_dims); + this->set_output_dims(output_dims); } /// Initialize GPU objects void setup_gpu() override { - transform_layer::setup_gpu(); + transform_layer::setup_gpu(); #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else @@ -262,6 +282,12 @@ class pooling_layer : public transform_layer { void fp_compute() override { if(this->using_gpus()) { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().fp_compute(); + return; + } +#endif // LBANN_HAS_DISTCONV fp_compute_cudnn(); } else { fp_compute_im2col(); @@ -270,6 +296,12 @@ class pooling_layer : public transform_layer { void bp_compute() override { if(this->using_gpus()) { +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().bp_compute(); + return; + } +#endif // LBANN_HAS_DISTCONV bp_compute_cudnn(); } else { bp_compute_im2col(); @@ -283,11 +315,12 @@ class pooling_layer : public transform_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + using ScalingType = cudnn::ScalingParamType; + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = 
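The setup_dims arithmetic in the pooling layer above derives each spatial output extent from the input extent, padding, window size, and stride. As a worked check: input 7, pad 1, window 3, stride 2 gives an effective extent of 7 + 2 - 3 + 1 = 7 and an output extent of (7 + 2 - 1) / 2 = 4. The helper below just restates that formula.

// Output extent per spatial dimension, matching the computation in setup_dims:
// ceil((in + 2*pad - window + 1) / stride) via integer arithmetic.
inline int pooled_extent(int in, int pad, int window, int stride) {
  const int effective = in + 2 * pad - window + 1;
  return (effective + stride - 1) / stride;  // integer ceiling division
}
// e.g. pooled_extent(7, 1, 3, 2) == 4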
this->get_local_activations(); if (local_input.Height() > 0 && local_input.Width() > 0) { - const DataType zero = DataType(0); - const DataType one = DataType(1); + const auto zero = El::TypeTraits::Zero(); + const auto one = El::TypeTraits::One(); CHECK_CUDNN(cudnnPoolingForward(cudnn::get_handle(), m_pooling_cudnn_desc, &one, @@ -305,15 +338,16 @@ class pooling_layer : public transform_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + using ScalingType = cudnn::ScalingParamType; + const auto& local_input = this->get_local_prev_activations(); + const auto& local_output = this->get_local_activations(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); if (local_input.Height() > 0 && local_input.Width() > 0) { // Useful constants - const DataType one = DataType(1); - const DataType zero = DataType(0); + const auto one = El::TypeTraits::One(); + const auto zero = El::TypeTraits::Zero(); // Perform backprop on GPU CHECK_CUDNN(cudnnPoolingBackward(cudnn::get_handle(), @@ -340,23 +374,23 @@ class pooling_layer : public transform_layer { } // Local matrices - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); // Pool parameters const int local_width = local_input.Width(); - const auto& input_dims = get_input_dims(); + const auto& input_dims = this->get_input_dims(); const int num_channels = input_dims[0]; - const int num_per_output_channel = get_output_size() / num_channels; + const int num_per_output_channel = this->get_output_size() / num_channels; // Initialize max pool indices if needed if(m_pool_mode == pool_mode::max) { - m_max_pool_indices.assign(get_output_size() * local_width, 0); + m_max_pool_indices.assign(this->get_output_size() * local_width, 0); } // Initialize matrices - DMat im2col_mat(m_pool_size * num_channels, num_per_output_channel); - DMat input_mat; + El::Matrix im2col_mat(m_pool_size * num_channels, num_per_output_channel); + El::Matrix input_mat; // Iterate through data samples for(int sample = 0; sample < local_width; ++sample) { @@ -364,7 +398,7 @@ class pooling_layer : public transform_layer { // Construct im2col matrix from input El::LockedView(input_mat, local_input, El::ALL, El::IR(sample)); - im2col(input_mat, + im2col(input_mat, im2col_mat, num_channels, input_dims.size() - 1, @@ -375,16 +409,16 @@ class pooling_layer : public transform_layer { if(m_pool_mode == pool_mode::max) { // Apply max pooling - DataType *output_buffer = local_output.Buffer(0, sample); - int *indices_buffer = &m_max_pool_indices[sample * get_output_size()]; + TensorDataType *output_buffer = local_output.Buffer(0, sample); + int *indices_buffer = &m_max_pool_indices[sample * this->get_output_size()]; LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_output_channel; ++j) { - DataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); - DataType max_entry = im2col_buffer[0]; + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); + TensorDataType max_entry = im2col_buffer[0]; 
int max_index = 0; for(int i = 1; i < m_pool_size; ++i) { - const DataType current_entry = im2col_buffer[i]; + const TensorDataType current_entry = im2col_buffer[i]; if(current_entry > max_entry) { max_entry = current_entry; max_index = i; @@ -399,13 +433,13 @@ class pooling_layer : public transform_layer { if(m_pool_mode == pool_mode::average) { // Apply average pooling - DataType *output_buffer = local_output.Buffer(0, sample); + TensorDataType *output_buffer = local_output.Buffer(0, sample); LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_output_channel; ++j) { - const DataType *im2col_buffer + const TensorDataType *im2col_buffer = im2col_mat.LockedBuffer(channel*m_pool_size, j); - DataType output_entry = 0; + TensorDataType output_entry = El::TypeTraits::Zero(); for(int i = 0; i < m_pool_size; ++i) { output_entry += im2col_buffer[i]; } @@ -422,23 +456,24 @@ class pooling_layer : public transform_layer { /// Pooling forward propagation with im2col void bp_compute_im2col() { + using CPUMatType = El::Matrix; if(m_pool_mode != pool_mode::max && m_pool_mode != pool_mode::average) { LBANN_ERROR("CPU pooling layer only supports max and average pooling"); } // Local matrices - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); // Pool parameters const int local_width = local_gradient_wrt_output.Width(); - const auto& input_dims = get_input_dims(); + const auto& input_dims = this->get_input_dims(); const int num_channels = input_dims[0]; - const int num_per_input_channel = get_output_size() / num_channels; + const int num_per_input_channel = this->get_output_size() / num_channels; // Initialize matrices - CPUMat im2col_mat(m_pool_size * num_channels, num_per_input_channel); - CPUMat gradient_wrt_input_col; + CPUMatType im2col_mat(m_pool_size * num_channels, num_per_input_channel); + CPUMatType gradient_wrt_input_col; // Iterate through data samples for(int sample = 0; sample < local_width; ++sample) { @@ -451,16 +486,16 @@ class pooling_layer : public transform_layer { // Copy previous error signal to im2col matrix entries // corresponding to max - const DataType *gradient_wrt_output_buffer + const TensorDataType *gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(0, sample); const int *indices_buffer - = &m_max_pool_indices[sample * get_output_size()]; + = &m_max_pool_indices[sample * this->get_output_size()]; LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_input_channel; ++j) { const int input_index = j + channel * num_per_input_channel; const int max_index = indices_buffer[input_index]; - DataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); im2col_buffer[max_index] = gradient_wrt_output_buffer[input_index]; } @@ -470,15 +505,15 @@ class pooling_layer : public transform_layer { // Compute gradient w.r.t. 
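A compact sketch of the CPU max-pooling pass shown above: each pooling window (a column of the im2col matrix) yields its maximum, and the winning offset is recorded so bp_compute_im2col can route the incoming gradient back to that entry. Plain nested vectors stand in for the im2col matrix; the function name is illustrative.

#include <cstddef>
#include <vector>

// One pooling window per entry of `windows`. Returns the pooled values;
// `argmax` records the offset of the winner in each window for backprop.
std::vector<float>
max_pool_windows(const std::vector<std::vector<float>>& windows,
                 std::vector<int>& argmax) {
  std::vector<float> out(windows.size());
  argmax.assign(windows.size(), 0);
  for (std::size_t j = 0; j < windows.size(); ++j) {
    float best = windows[j][0];
    int best_i = 0;
    for (std::size_t i = 1; i < windows[j].size(); ++i) {
      if (windows[j][i] > best) { best = windows[j][i]; best_i = static_cast<int>(i); }
    }
    out[j] = best;
    argmax[j] = best_i;
  }
  return out;
}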
im2col matrix for average pooling if(m_pool_mode == pool_mode::average) { - const DataType *gradient_wrt_output_buffer + const TensorDataType *gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(0, sample); LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_input_channel; ++j) { - DataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); const int input_index = j + channel * num_per_input_channel; - const DataType output_entry - = gradient_wrt_output_buffer[input_index] / m_pool_size; + const TensorDataType output_entry + = gradient_wrt_output_buffer[input_index] / El::To(m_pool_size); for(int i = 0; i < m_pool_size; ++i) { im2col_buffer[i] = output_entry; } @@ -490,7 +525,7 @@ class pooling_layer : public transform_layer { // Compute error signal (i.e. gradient w.r.t. input) El::View(gradient_wrt_input_col, local_gradient_wrt_input, El::ALL, El::IR(sample)); - col2im(im2col_mat, + col2im(im2col_mat, gradient_wrt_input_col, num_channels, input_dims.size() - 1, @@ -503,6 +538,18 @@ class pooling_layer : public transform_layer { } +#ifdef LBANN_HAS_DISTCONV + friend class pooling_distconv_adapter; + protected: + bool is_distconv_supported() const override; + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique< + pooling_distconv_adapter>(*this); + } + pooling_distconv_adapter& get_distconv_adapter() override; + const pooling_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV + #ifdef LBANN_HAS_CUDNN /** Copy pooling cuDNN descriptor. */ static void copy_pooling_cudnn_desc(const cudnnPoolingDescriptor_t& src, @@ -553,6 +600,187 @@ class pooling_layer : public transform_layer { }; +#ifdef LBANN_HAS_DISTCONV +template +pooling_distconv_adapter& +pooling_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const pooling_distconv_adapter& +pooling_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +bool pooling_layer::is_distconv_supported() const { + if (Dev != El::Device::GPU || T_layout != data_layout::DATA_PARALLEL) { + return false; + } + + bool cond = true; + for(int i = 0; i < dc::get_num_spatial_dims(*this); i++) { + cond &= (m_pool_dims[i] % 2 != 0) || + (m_pool_dims[i] == m_strides[i]); + } + if (!cond) { + dc::MPIPrintStreamDebug() << "pooling: unsupported due to window shape: " + << dc::util::join_xd_array(m_pool_dims); + return false; + } + + for (int i = 0; i < dc::get_num_spatial_dims(*this); i++) { + bool odd = m_pool_dims[i] % 2; + if (odd) { + int stencil = (m_pool_dims[i] - 1) / 2; + if (!(m_pads[i] == 0 || m_pads[i] == stencil)) { + dc::MPIPrintStreamDebug() << "pooling: unsupported due to padding: " + << m_pads[i]; + return false; + } + if (!(m_strides[i] == 1 || m_strides[i] == stencil + 1)) { + dc::MPIPrintStreamDebug() << "pooling: unsupported due to strides"; + return false; + } + } else { + if (m_pads[i] != 0) return false; + if (m_pool_dims[i] != m_strides[i]) return false; + } + } + + return true; +} + +template +void pooling_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + const auto &l = dynamic_cast&>( + this->layer()); + dc::IntVector overlap(dc::get_num_dims(l), 0); + const auto &ps = 
l.get_parallel_strategy(); + auto pool_dims = l.m_pool_dims; + std::reverse(pool_dims.begin(), pool_dims.end()); + for(int i = 0; i < dc::get_num_spatial_dims(l); i++) { + int splits = 0; + switch (i) { + case 0: splits = ps.width_splits; break; + case 1: splits = ps.height_splits; break; + case 2: splits = ps.depth_splits; break; + } + if(splits == 1) continue; + int ov = 0; + if (pool_dims[i] % 2) { + ov = (pool_dims[i] - 1) / 2; + } else { + // no halo dependency is assumed for now + ov = 0; + } + overlap[i] = ov; + } + auto &prev_activations_dist = this->get_prev_activations_dist(); + auto &activations_dist = this->get_activations_dist(); + auto &error_signals_dist = this->get_error_signals_dist(); + auto &prev_error_signals_dist = this->get_prev_error_signals_dist(); + prev_activations_dist.set_overlap(overlap); + constraints.mark_updated(prev_activations_dist); + constraints.mark_invariant(prev_activations_dist); + // cudnnPoolingBackward requires activations and + // prev_error_signals must have the same stride + constraints.mark_equivalent(activations_dist, prev_error_signals_dist); + // cudnnPoolingBackward requires prev_activations and + // error_signals must have the same stride + constraints.mark_equivalent(error_signals_dist, prev_activations_dist); +} + +template +dc::Shape pooling_distconv_adapter:: +get_activations_local_shape(int index) const { + assert_eq(index, 0); + const auto &layer = dynamic_cast&>(this->layer()); + auto filter_dims = layer.m_pool_dims; + std::reverse(std::begin(filter_dims), std::end(filter_dims)); + auto strides = layer.m_strides; + std::reverse(std::begin(strides), std::end(strides)); + const std::vector dilations( + dc::get_num_spatial_dims(layer), 1); + bool use_padding = layer.m_pads[0] != 0; + auto output_spatial_local_shape = + ::distconv::get_pooling_output_local_tensor_shape( + this->get_prev_activations(), filter_dims, strides, use_padding, dilations); + return output_spatial_local_shape; +} + +template +void pooling_distconv_adapter:: +setup_layer(size_t workspace_capacity) { + auto &l = dynamic_cast&>( + this->layer()); + + // Init the dc::Pooling layer + m_pooling = make_unique>( + dc::get_backend(), dc::get_num_dims(l), + dc::get_halo_exchange_method()); + + std::string mode; + switch(l.m_pool_mode) { + case pool_mode::max: + mode = "MAX"; break; + case pool_mode::average: + mode = "AVERAGE"; break; + case pool_mode::average_no_pad: + mode = "AVERAGE_NO_PAD"; break; + default: + LBANN_ERROR("pooling_layer: no DISTCONV implementation for pooling mode"); + } + + std::vector pool_dims = l.m_pool_dims; + std::reverse(pool_dims.begin(), pool_dims.end()); + std::vector pads = l.m_pads; + std::reverse(pads.begin(), pads.end()); + std::vector strides = l.m_strides; + std::reverse(strides.begin(), strides.end()); + + m_pooling->setup(this->get_prev_activations(), + this->get_activations(), + this->get_error_signals(), + this->get_prev_error_signals(), + pool_dims, pads, strides, + mode); +} + +template +void pooling_distconv_adapter:: +fp_compute() { + m_pooling->forward(TensorDataType{1}, this->get_prev_activations(), + TensorDataType{0}, this->get_activations()); +} + +template +void pooling_distconv_adapter:: +bp_compute() { + m_pooling->backward(TensorDataType{1}, this->get_activations(), + this->get_prev_error_signals(), + this->get_prev_activations(), TensorDataType{0}, + this->get_error_signals()); +} +#endif // LBANN_HAS_DISTCONV + +LBANN_DEFINE_LAYER_BUILDER(pooling); + +#ifndef LBANN_POOLING_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, 
Device) \ + extern template class pooling_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_POOLING_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_POOLING_HPP_INCLUDED diff --git a/include/lbann/layers/transform/reduction.hpp b/include/lbann/layers/transform/reduction.hpp index 15df56534e1..8cccc0ce13a 100644 --- a/include/lbann/layers/transform/reduction.hpp +++ b/include/lbann/layers/transform/reduction.hpp @@ -38,24 +38,26 @@ enum class reduction_mode {INVALID, SUM, AVERAGE}; * * @todo Reduction over specified dimensions. */ -template -class reduction_layer : public transform_layer { +template +class reduction_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "reduction currently only supports DATA_PARALLEL"); private: /** Reduction mode. */ const reduction_mode m_mode; /** Vector composed of ones. */ - DMat m_ones; + El::Matrix m_ones; public: reduction_layer(lbann_comm *comm, reduction_mode mode) - : transform_layer(comm), + : transform_layer(comm), m_mode(mode) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "reduction currently only supports DATA_PARALLEL"); if (mode == reduction_mode::INVALID) { LBANN_ERROR("invalid reduction mode"); } @@ -67,7 +69,7 @@ class reduction_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::string mode_str; switch (m_mode) { case reduction_mode::SUM: mode_str = "sum"; break; @@ -82,16 +84,16 @@ class reduction_layer : public transform_layer { protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + this->set_output_dims({1}); } void fp_compute() override { // Local matrices - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); const El::Int input_size = local_input.Height(); // Apply reduction @@ -99,14 +101,15 @@ class reduction_layer : public transform_layer { case reduction_mode::SUM: El::Ones(m_ones, input_size, 1); El::Gemv(El::TRANSPOSE, - DataType(1), local_input, m_ones, - DataType(0), local_output); + El::TypeTraits::One(), local_input, m_ones, + El::TypeTraits::Zero(), local_output); break; case reduction_mode::AVERAGE: El::Ones(m_ones, input_size, 1); El::Gemv(El::TRANSPOSE, - DataType(1) / input_size, local_input, m_ones, - DataType(0), local_output); + El::TypeTraits::One() / El::To(input_size), + local_input, m_ones, + El::TypeTraits::Zero(), local_output); break; default: LBANN_ERROR("invalid reduction mode"); @@ -117,8 +120,8 @@ class reduction_layer : public transform_layer { void bp_compute() override { // Local matrices - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); const El::Int input_size = local_gradient_wrt_input.Height(); // Compute gradients w.r.t. 
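The reduction layer above collapses each mini-batch sample to a single value by multiplying with a vector of ones (a sum), optionally scaled by one over the sample size (an average). A plain-C++ equivalent over a column-major buffer, with an illustrative name:

#include <cstddef>
#include <vector>

// Sum (or average) the entries of each column of X, i.e. ones^T * X per sample.
std::vector<double> reduce_columns(const std::vector<double>& X,
                                   std::size_t height, std::size_t width,
                                   bool average) {
  std::vector<double> out(width, 0.0);
  for (std::size_t col = 0; col < width; ++col) {
    for (std::size_t row = 0; row < height; ++row) {
      out[col] += X[col * height + row];
    }
    if (average) { out[col] /= static_cast<double>(height); }
  }
  return out;
}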
inputs @@ -126,14 +129,15 @@ class reduction_layer : public transform_layer { case reduction_mode::SUM: El::Ones(m_ones, input_size, 1); El::Gemm(El::NORMAL, El::NORMAL, - DataType(1), m_ones, local_gradient_wrt_output, - DataType(0), local_gradient_wrt_input); + El::TypeTraits::One(), m_ones, local_gradient_wrt_output, + El::TypeTraits::Zero(), local_gradient_wrt_input); break; case reduction_mode::AVERAGE: El::Ones(m_ones, input_size, 1); El::Gemm(El::NORMAL, El::NORMAL, - DataType(1) / input_size, m_ones, local_gradient_wrt_output, - DataType(0), local_gradient_wrt_input); + El::TypeTraits::One() / El::To(input_size), + m_ones, local_gradient_wrt_output, + El::TypeTraits::Zero(), local_gradient_wrt_input); break; default: LBANN_ERROR("invalid reduction mode"); @@ -143,6 +147,14 @@ class reduction_layer : public transform_layer { }; +#ifndef LBANN_REDUCTION_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class reduction_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_REDUCTION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_REDUCTION_HPP_INCLUDED diff --git a/include/lbann/layers/transform/reshape.hpp b/include/lbann/layers/transform/reshape.hpp index 7770080ff69..55b933add64 100644 --- a/include/lbann/layers/transform/reshape.hpp +++ b/include/lbann/layers/transform/reshape.hpp @@ -36,13 +36,13 @@ namespace lbann { * Forward and backward prop simply involve setting up tensor views, * and hence are very cheap. */ -template -class reshape_layer : public transform_layer { +template +class reshape_layer : public transform_layer { public: reshape_layer(lbann_comm *comm, std::vector dims) - : transform_layer(comm) { - set_output_dims(dims); + : transform_layer(comm) { + this->set_output_dims(dims); } reshape_layer* copy() const override { return new reshape_layer(*this); } std::string get_type() const override { return "reshape"; } @@ -51,11 +51,11 @@ class reshape_layer : public transform_layer { protected: - void setup_dims() override { - transform_layer::setup_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); - const auto& input_dims = get_input_dims(); - auto output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(); + auto output_dims = this->get_output_dims(); // Determine any unspecified dimensions int unspecified_dim = -1; @@ -70,12 +70,12 @@ class reshape_layer : public transform_layer { output_dims.end(), 1, std::multiplies()); - output_dims[unspecified_dim] = get_input_size() / specified_size; - set_output_dims(output_dims); + output_dims[unspecified_dim] = this->get_input_size() / specified_size; + this->set_output_dims(output_dims); } // Check that reshape is valid - if (get_input_size() != get_output_size()) { + if (this->get_input_size() != this->get_output_size()) { std::stringstream err; err << "input tensor dimensions ("; for (size_t i = 0; i < input_dims.size(); ++i) { @@ -92,16 +92,25 @@ class reshape_layer : public transform_layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - El::LockedView(get_activations(), get_prev_activations()); + El::LockedView(this->get_activations(), this->get_prev_activations()); } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - El::LockedView(get_error_signals(), get_prev_error_signals()); + El::LockedView(this->get_error_signals(), this->get_prev_error_signals()); } void fp_compute() override {} void bp_compute() override {} }; 
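The reshape layer's setup_dims above allows one output dimension to be left unspecified and infers it so that the total output size equals the input size. A standalone version of that inference, assuming (for this sketch) that a non-positive entry marks the unspecified dimension:

#include <cstddef>
#include <functional>
#include <numeric>
#include <stdexcept>
#include <vector>

std::vector<int> infer_reshape_dims(std::vector<int> dims, long long input_size) {
  int unspecified = -1;
  long long specified = 1;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] <= 0) { unspecified = static_cast<int>(i); }
    else { specified *= dims[i]; }
  }
  if (unspecified >= 0) {
    dims[unspecified] = static_cast<int>(input_size / specified);
  }
  const long long total = std::accumulate(dims.begin(), dims.end(), 1LL,
                                          std::multiplies<long long>());
  if (total != input_size) { throw std::runtime_error("invalid reshape"); }
  return dims;
}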
+#ifndef LBANN_RESHAPE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class reshape_layer; \ + extern template class reshape_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_RESHAPE_LAYER_INSTANTIATE + } // namespace lbann #endif // RESHAPE_HPP_INCLUDED diff --git a/include/lbann/layers/transform/slice.hpp b/include/lbann/layers/transform/slice.hpp index 62143bc32b8..317b803ecc7 100644 --- a/include/lbann/layers/transform/slice.hpp +++ b/include/lbann/layers/transform/slice.hpp @@ -24,11 +24,14 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#ifndef LBANN_LAYER_SLICE_HPP_INCLUDED -#define LBANN_LAYER_SLICE_HPP_INCLUDED +#ifndef LBANN_LAYERS_TRANSFORM_SLICE_HPP_INCLUDED +#define LBANN_LAYERS_TRANSFORM_SLICE_HPP_INCLUDED -#include "lbann/layers/transform/transform.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/data_readers/data_reader_jag_conduit.hpp" +#include "lbann/models/model.hpp" +#include "lbann/trainers/trainer.hpp" namespace lbann { @@ -44,248 +47,261 @@ namespace lbann { * \cdots\times D_n @f$ * tensor. */ -template -class slice_layer : public transform_layer { +template +class slice_layer : public data_type_layer { public: - slice_layer(lbann_comm *comm, - El::Int slice_dim, - std::vector slice_points) - : transform_layer(comm), - m_slice_dim(slice_dim), - m_slice_points(slice_points) { - this->m_expected_num_child_layers = -1; // No limit on children - } + slice_layer(lbann_comm *comm); + slice_layer(const slice_layer& other) = default; + slice_layer& operator=(const slice_layer& other) = default; - slice_layer(const slice_layer& other) - : transform_layer(other), - m_slice_dim(other.m_slice_dim), - m_slice_points(other.m_slice_points) { - m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); - m_output_v.reset(other.m_output_v ? other.m_output_v->Copy() : nullptr); - } + slice_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; - slice_layer& operator=(const slice_layer& other) { - transform_layer::operator=(other); - m_slice_dim = other.m_slice_dim; - m_slice_points = other.m_slice_points; - m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); - m_output_v.reset(other.m_output_v ? other.m_output_v->Copy() : nullptr); + description get_description() const override; + + void setup_slice_points(size_t slice_dim, + std::vector slice_points) { + m_slice_dim = slice_dim; + m_slice_points = std::move(slice_points); } - slice_layer* copy() const override { return new slice_layer(*this); } - std::string get_type() const override { return "slice"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - - /** Get slice points. */ - std::vector& get_slice_points() { return m_slice_points; } - /** Get slice points (const). */ - std::vector get_slice_points() const { return m_slice_points; } - - description get_description() const override { - auto&& desc = transform_layer::get_description(); - desc.add("Slice dimension", m_slice_dim); - std::stringstream ss; - for (size_t i = 0; i < m_slice_points.size(); ++i) { - ss << (i > 0 ? 
", " : "") << m_slice_points[i]; - } - desc.add("Slice points", ss.str()); - return desc; + void setup_slice_points(size_t slice_dim, + bool set_slice_points_from_data_reader, + const slice_points_mode var_category) { + m_slice_dim = slice_dim; + m_set_slice_points_from_data_reader = set_slice_points_from_data_reader; + m_var_category = var_category; } protected: - void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& input = get_prev_activations(); - m_input_v.reset(input.Construct(input.Grid(), input.Root())); - m_output_v.reset(input.Construct(input.Grid(), input.Root())); - } + void setup_dims(DataReaderMetaData& dr_metadata) override; - void setup_dims() override { - transform_layer::setup_dims(); - const auto& input_dims = get_input_dims(); - const auto& num_outputs = get_num_children(); - - // Check that slice parameters are valid - std::stringstream err; - if (m_slice_dim < 0 || m_slice_dim >= (El::Int) input_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has " << input_dims.size() << " dimensions, " - << "but attempted to slice along dimension " << m_slice_dim; - LBANN_ERROR(err.str()); - } - if ((int) m_slice_points.size() <= num_outputs) { - err << get_type() << " layer \"" << get_name() << "\" " - << "requires more slice points than output tensors " - << "(found " << m_slice_points.size() << " slice points " - << "and " << m_child_layers.size() << " output tensors)"; - LBANN_ERROR(err.str()); - } - if (!std::is_sorted(m_slice_points.begin(), m_slice_points.end())) { - err << get_type() << " layer \"" << get_name() << "\" " - << "has unsorted slice points"; - LBANN_ERROR(err.str()); - } - if (m_slice_points.front() < 0 - || m_slice_points.back() > input_dims[m_slice_dim]) { - err << get_type() << " layer \"" << get_name() << "\" " - << "expects slice points in the range " - << "[0, " << input_dims[m_slice_dim] << "], " - << "but found an invalid slice point "; - if (m_slice_points.front() < 0) { - err << "(" << m_slice_points.front() << ")"; - } else { - err << "(" << m_slice_points.back() << ")"; - } - LBANN_ERROR(err.str()); - } + void fp_setup_outputs(El::Int mini_batch_size) override; + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override; + void fp_compute() override; + void bp_compute() override; - // Set output tensor dimensions - auto output_dims = input_dims; - for (int i = 0; i < num_outputs; ++i) { - output_dims[m_slice_dim] = m_slice_points[i+1] - m_slice_points[i]; - set_output_dims(output_dims, i); - } +private: - } + /** Tensor dimension to slice. */ + size_t m_slice_dim; + /** Slice points for each child layer. */ + std::vector m_slice_points; + /** Slice points are automatically defined by the data reader */ + bool m_set_slice_points_from_data_reader; + /** Category for retrieving slice points from data reader */ + slice_points_mode m_var_category; + +#ifdef LBANN_HAS_GPU + /** @brief Workspace buffer. + * + * Parameters for CUDA kernels are copied into this buffer and + * asynchronously transferred to GPU. + */ + std::vector m_workspace; + /** @brief CUDA event for workspace buffer. + * + * Makes sure asynchronous GPU memory transfers are completed + * before modifying workspace buffer. 
+ */ + cuda::event_wrapper m_workspace_event; +#endif // LBANN_HAS_GPU + + template + friend void fp_setup_outputs_impl(slice_layer&); + template + friend void fp_compute_impl(slice_layer&); + template + friend void bp_compute_impl(slice_layer&); - void fp_setup_outputs(El::Int mini_batch_size) override { - const auto& num_outputs = get_num_children(); - const auto& input_dims = get_input_dims(); - - // Divide input tensor into unit slices along slice dimension - // Note: Each unit slice is divided into contiguous "unit blocks" - const auto& input_num_unit_slices = input_dims[m_slice_dim]; - const auto& blocks_per_slice - = std::accumulate(&input_dims[0], &input_dims[m_slice_dim], - 1, std::multiplies()); - const auto& unit_block_size - = std::accumulate(input_dims.begin() + m_slice_dim + 1, - input_dims.end(), - 1, std::multiplies()); - const auto& input_block_stride = (input_num_unit_slices - * unit_block_size); - - // Populate output tensors with slices of input tensor - const auto& input = get_prev_activations(); - for (int i = 0; i < num_outputs; ++i) { - const auto& output_dims = get_output_dims(i); - const auto& output_size = get_output_size(i); - auto& output = get_activations(i); - output.Empty(false); - - // Divide output tensor into unit slices - const auto& output_num_unit_slices = output_dims[m_slice_dim]; - - // Merge unit slices and get first contiguous input block - const auto& block_size = output_num_unit_slices * unit_block_size; - const auto& input_block_offset = m_slice_points[i] * unit_block_size; - El::LockedView(*m_input_v, input, - El::IR(input_block_offset, - input_block_offset + block_size), - El::ALL); - - // Populate output tensor one block at a time - // Note: If there is only one block, output can be a view - if (blocks_per_slice > 1) { - output.AlignWith(*m_input_v); - output.Resize(output_size, mini_batch_size); - for (int block = 0; block < blocks_per_slice; ++block) { - const auto& input_offset = (input_block_offset - + block * input_block_stride); - const auto& output_offset = block * block_size; - El::LockedView(*m_input_v, input, - El::IR(input_offset, input_offset + block_size), - El::ALL); - El::View(*m_output_v, output, - El::IR(output_offset, output_offset + block_size), - El::ALL); - El::Copy(*m_input_v, *m_output_v); - } - } else { - El::LockedView(output, *m_input_v); - } +}; +// ========================================================= +// Implementation +// ========================================================= + +template +slice_layer::slice_layer(lbann_comm *comm) + : data_type_layer(comm), + m_set_slice_points_from_data_reader(false), + m_var_category(slice_points_mode::NA) { + this->m_expected_num_child_layers = -1; // No limit on children +} + +template +slice_layer* slice_layer::copy() const { + return new slice_layer(*this); +} + +template +std::string slice_layer::get_type() const { + return "slice"; +} + +template +data_layout slice_layer::get_data_layout() const { + return Layout; +} + +template +El::Device slice_layer::get_device_allocation() const { + return Device; +} + +template +description slice_layer::get_description() const { + auto desc = data_type_layer::get_description(); + desc.add("Slice dimension", m_slice_dim); + std::ostringstream ss; + for (size_t i = 0; i < m_slice_points.size(); ++i) { + ss << (i > 0 ? 
", " : "") << m_slice_points[i]; + } + desc.add("Slice points", ss.str()); + return desc; +} + +template +void slice_layer::setup_dims(DataReaderMetaData& dr_metadata) { + data_type_layer::setup_dims(dr_metadata); + + // Setup the slice points if they are to be established by the data reader + if(m_set_slice_points_from_data_reader) { + std::vector slice_points; + std::string slice_point_method_name = "'get_slice_points_from_reader'"; + for (auto& slice_point + : dr_metadata.slice_points[m_var_category]) { + slice_points.push_back(slice_point); } + if (slice_points.size() < 2u) { + LBANN_ERROR(slice_point_method_name, " is not supported by the reader."); + return; + } + m_slice_points = std::move(slice_points); } - void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - const auto& num_outputs = get_num_children(); - const auto& input_dims = get_input_dims(); - - // Initialize gradient w.r.t. input tensor - auto& gradient_wrt_input = get_error_signals(); - gradient_wrt_input.Empty(false); - gradient_wrt_input.AlignWith(get_prev_activations()); - gradient_wrt_input.Resize(get_input_size(), mini_batch_size); - if (m_slice_points[0] != 0 - || m_slice_points[num_outputs] != input_dims[m_slice_dim]) { - El::Zero(gradient_wrt_input); + // Check that slice parameters are valid + const auto& input_dims = this->get_input_dims(); + const size_t num_outputs = this->get_num_children(); + if (m_slice_dim >= input_dims.size()) { + std::ostringstream err; + err << this->get_type() << " layer \"" << this->get_name() << "\" " + << "is slicing along dimension " << m_slice_dim << ", " + << "but it has a " << input_dims.size() << "-D input tensor " + << "(parent layer \"" << this->get_parent_layers()[0]->get_name() << "\" " + << "outputs with dimensions "; + for (size_t d=0; d0 ? " x " : "") << input_dims[d]; } + err << ")"; + LBANN_ERROR(err.str()); + } + if (m_slice_points.size() <= num_outputs) { + LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ", + "has ",num_outputs," children, " + "but only ",m_slice_points.size()," slice points"); + } + if (!std::is_sorted(m_slice_points.begin(), m_slice_points.end())) { + LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ", + "has unsorted slice points"); + } + if (m_slice_points.back() > static_cast(input_dims[m_slice_dim])) { + LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ", + "has a slice point of ",m_slice_points.back(),", ", + "which is outside the expected range " + "[0 ",input_dims[m_slice_dim],"]"); + } - // Divide input tensor into unit slices along slice dimension - // Note: Each unit slice is divided into contiguous "unit blocks" - const auto& input_num_unit_slices = input_dims[m_slice_dim]; - const auto& blocks_per_slice - = std::accumulate(&input_dims[0], &input_dims[m_slice_dim], - 1, std::multiplies()); - const auto& unit_block_size - = std::accumulate(input_dims.begin() + m_slice_dim + 1, - input_dims.end(), - 1, std::multiplies()); - const auto& input_block_stride = (input_num_unit_slices - * unit_block_size); - - // Populate slices of gradient w.r.t. 
input tensor - for (int i = 0; i < num_outputs; ++i) { - const auto& output_dims = get_output_dims(i); - const auto& gradient_wrt_output = get_prev_error_signals(i); - - // Divide output tensor into unit slices - const auto& output_num_unit_slices = output_dims[m_slice_dim]; - - // Merge unit slices - const auto& block_size = output_num_unit_slices * unit_block_size; - const auto& input_block_offset = m_slice_points[i] * unit_block_size; - - // Populate gradient w.r.t. input tensor one block at a time - for (int block = 0; block < blocks_per_slice; ++block) { - const auto& input_offset = (input_block_offset - + block * input_block_stride); - const auto& output_offset = block * block_size; - El::LockedView(*m_output_v, gradient_wrt_output, - El::IR(output_offset, output_offset + block_size), - El::ALL); - El::View(*m_input_v, gradient_wrt_input, - El::IR(input_offset, input_offset + block_size), - El::ALL); - El::Copy(*m_output_v, *m_input_v); - } - - } + // Model-parallel implementation only supports flat data + if (Layout == data_layout::MODEL_PARALLEL && input_dims.size() != 1) { + LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ", + "attempted to slice along dimension ",m_slice_dim,", ", + "but model-parallel slice layer only supports flat data"); + } + // Set output tensor dimensions + auto output_dims = input_dims; + for (size_t i = 0; i < num_outputs; ++i) { + output_dims[m_slice_dim] = m_slice_points[i+1] - m_slice_points[i]; + this->set_output_dims(output_dims, i); } - void fp_compute() override {} - void bp_compute() override {} +} + +template +void fp_setup_outputs_impl( + slice_layer& l) { + + // Slice Elemental matrices + // Note: Assume each mini-batch sample is flat. + const size_t num_outputs = l.get_num_children(); + const auto& input = l.get_prev_activations(); + size_t offset = l.m_slice_points.front(); + for (size_t j=0; j m_slice_points; +template +void fp_setup_outputs_impl( + slice_layer& l) { - /** View into input tensor. */ - std::unique_ptr m_input_v; - /** View into output tensor. 
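For the flat (model-parallel) case handled by fp_setup_outputs_impl above, the slice points simply delimit contiguous ranges of the input, one range per child layer. A sketch of that partitioning over a plain vector; the real layer hands out views rather than copies.

#include <cstddef>
#include <vector>

// Slice points {0, 3, 5} over a length-5 sample yield pieces [0,3) and [3,5),
// so a layer with k children needs k+1 slice points.
std::vector<std::vector<float>>
slice_flat(const std::vector<float>& input,
           const std::vector<std::size_t>& points) {
  std::vector<std::vector<float>> outputs;
  for (std::size_t i = 0; i + 1 < points.size(); ++i) {
    outputs.emplace_back(input.begin() + points[i], input.begin() + points[i + 1]);
  }
  return outputs;
}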
*/ - std::unique_ptr m_output_v; + const size_t num_outputs = l.get_num_children(); + const auto& input = l.get_prev_activations(); + for (size_t j=0; j +void slice_layer::fp_setup_outputs(El::Int mini_batch_size) { + fp_setup_outputs_impl(*this); +} + +template +void slice_layer::fp_compute() { + fp_compute_impl(*this); +} + +template +void slice_layer::bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + const auto& output0_grad = this->get_prev_error_signals(0); + auto& input_grad = this->get_error_signals(); + input_grad.Empty(false); + input_grad.AlignWith(output0_grad); + El::Zeros(input_grad, this->get_input_size(), output0_grad.Width()); +} + +template +void slice_layer::bp_compute() { + bp_compute_impl(*this); +} + +#ifndef LBANN_SLICE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class slice_layer< \ + T, data_layout::DATA_PARALLEL, Device>; \ + extern template class slice_layer< \ + T, data_layout::MODEL_PARALLEL, Device> + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_SLICE_LAYER_INSTANTIATE } // namespace lbann -#endif // LBANN_LAYER_SLICE_HPP_INCLUDED +#endif // LBANN_LAYERS_TRANSFORM_SLICE_HPP_INCLUDED diff --git a/include/lbann/layers/transform/sort.hpp b/include/lbann/layers/transform/sort.hpp index 8d04e25a795..91d10f8f5df 100644 --- a/include/lbann/layers/transform/sort.hpp +++ b/include/lbann/layers/transform/sort.hpp @@ -32,17 +32,19 @@ namespace lbann { /** @brief Sort tensor entries. */ -template -class sort_layer : public transform_layer { +template +class sort_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "sort layer only supports DATA_PARALLEL"); public: sort_layer(lbann_comm *comm, bool descending = false) - : transform_layer(comm), m_descending(descending) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "sort layer only supports DATA_PARALLEL"); + : transform_layer(comm), m_descending(descending) { } sort_layer(const sort_layer& other) - : transform_layer(other), + : transform_layer(other), m_descending(other.m_descending) { if (other.m_indices) { switch (other.m_indices->GetDevice()) { @@ -60,7 +62,7 @@ class sort_layer : public transform_layer { } } sort_layer& operator=(const sort_layer& other) { - transform_layer::operator=(other); + transform_layer::operator=(other); m_descending = other.m_descending; if (!other.m_indices) { m_indices.reset(nullptr); @@ -87,21 +89,21 @@ class sort_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Descending", m_descending); return desc; } protected: - void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& dist = get_activations().DistData(); + transform_layer::setup_matrices(grid); + const auto& dist = this->get_activations().DistData(); switch (dist.device) { case El::Device::CPU: m_indices.reset(new El::Matrix()); @@ -117,8 +119,8 @@ class sort_layer : public transform_layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - transform_layer::fp_setup_outputs(mini_batch_size); - 
const auto& output = get_activations(); + transform_layer::fp_setup_outputs(mini_batch_size); + const auto& output = this->get_activations(); m_indices->Resize(output.LocalHeight(), output.LocalWidth()); } @@ -138,6 +140,14 @@ class sort_layer : public transform_layer { }; +#ifndef LBANN_SORT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class sort_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_SORT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_SORT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/split.hpp b/include/lbann/layers/transform/split.hpp index a7f151f7452..90248ce99a8 100644 --- a/include/lbann/layers/transform/split.hpp +++ b/include/lbann/layers/transform/split.hpp @@ -30,15 +30,32 @@ #include #include "lbann/layers/transform/transform.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { +#ifdef LBANN_HAS_DISTCONV +template +class split_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + split_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~split_distconv_adapter() = default; + void setup_distributions(tensor_overlap_constraints &constraints) override; + dc::Shape get_activations_local_shape(int index) const override; + std::unique_ptr setup_activations_i(int index) const override; + void bp_compute(); +}; +#endif // LBANN_HAS_DISTCONV + /** @brief Present input tensor to multiple outputs. */ -template -class split_layer : public transform_layer { +template +class split_layer : public transform_layer { public: - split_layer(lbann_comm *comm) : transform_layer(comm) { + split_layer(lbann_comm *comm) : transform_layer(comm) { this->m_expected_num_child_layers = -1; // No limit on children } @@ -49,37 +66,118 @@ class split_layer : public transform_layer { protected: - void setup_dims() override { - Layer::setup_dims(); - for (int i = 0; i < get_num_children(); ++i) { - set_output_dims(get_input_dims(), i); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); + for (int i = 0; i < this->get_num_children(); ++i) { + this->set_output_dims(this->get_input_dims(), i); } } void fp_setup_outputs(El::Int mini_batch_size) override { - const auto& input = get_prev_activations(); - for (int i = 0; i < get_num_children(); ++i) { - El::LockedView(get_activations(i), input); + const auto& input = this->get_prev_activations(); + for (int i = 0; i < this->get_num_children(); ++i) { + El::LockedView(this->get_activations(i), input); } } void fp_compute() override {} void bp_compute() override { - auto& gradient_wrt_input = get_error_signals(); - if (get_num_children() > 0) { - El::Copy(get_prev_error_signals(0), gradient_wrt_input); +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().bp_compute(); + return; + } +#endif // LBANN_HAS_DISTCONV + auto& gradient_wrt_input = this->get_error_signals(); + if (this->get_num_children() > 0) { + El::Copy(this->get_prev_error_signals(0), gradient_wrt_input); } else { El::Zero(gradient_wrt_input); } - for (int i = 1; i < get_num_children(); ++i) { - El::Axpy(DataType(1), get_prev_error_signals(i), + for (int i = 1; i < this->get_num_children(); ++i) { + El::Axpy(DataType(1), this->get_prev_error_signals(i), gradient_wrt_input); } } +#ifdef LBANN_HAS_DISTCONV + protected: + bool is_distconv_supported() const 
override { + return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } + split_distconv_adapter& get_distconv_adapter() override; + const split_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +split_distconv_adapter& +split_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const split_distconv_adapter& +split_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +void split_distconv_adapter:: +setup_distributions(tensor_overlap_constraints &constraints) { + data_type_distconv_adapter::setup_distributions( + constraints); + + auto &x = this->get_prev_activations_dist(); + auto &y = this->get_activations_dist(); + auto &dx = this->get_error_signals_dist(); + auto &dy = this->get_prev_error_signals_dist(); + + constraints.mark_equivalent(x, y); + constraints.mark_equivalent(dx, dy); +} + +template +dc::Shape split_distconv_adapter:: +get_activations_local_shape(int index) const { + return data_type_distconv_adapter::get_activations_local_shape(0); +} + +template +std::unique_ptr::TensorDevType> +split_distconv_adapter:: +setup_activations_i(int index) const { + return make_unique(this->get_prev_activations(0)); +} +#endif // LBANN_HAS_DISTCONV + +LBANN_DEFINE_LAYER_BUILDER(split); + +#ifndef LBANN_SPLIT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class split_layer; \ + extern template class split_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#ifdef LBANN_HAS_DISTCONV +#define PROTO_DEVICE(T, Device) \ + extern template class split_distconv_adapter; \ + extern template class split_distconv_adapter + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_HAS_DISTCONV +#endif // LBANN_SPLIT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_SPLIT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/stop_gradient.hpp b/include/lbann/layers/transform/stop_gradient.hpp index 4adeafbb205..b3f49339753 100644 --- a/include/lbann/layers/transform/stop_gradient.hpp +++ b/include/lbann/layers/transform/stop_gradient.hpp @@ -39,27 +39,38 @@ namespace lbann { * means that computed gradients in preceeding layers are not exact * gradients of the objective function. 
*/ -template -class stop_gradient_layer : public transform_layer { +template +class stop_gradient_layer : public transform_layer { public: - stop_gradient_layer(lbann_comm *comm) : transform_layer(comm) {} + stop_gradient_layer(lbann_comm *comm) : transform_layer(comm) {} stop_gradient_layer* copy() const override { return new stop_gradient_layer(*this); } std::string get_type() const override { return "stop_gradient"; } data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } protected: - void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); } void fp_setup_outputs(El::Int mini_batch_size) override { - El::LockedView(get_activations(), get_prev_activations()); + El::LockedView(this->get_activations(), this->get_prev_activations()); } void fp_compute() override {} }; +LBANN_DEFINE_LAYER_BUILDER(stop_gradient); + +#ifndef LBANN_STOP_GRADIENT_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class stop_gradient_layer; \ + extern template class stop_gradient_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_STOP_GRADIENT_LAYER_INSTANTIATE + } // namespace lbann #endif // STOP_GRADIENT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/sum.hpp b/include/lbann/layers/transform/sum.hpp index ab9ce9a4af6..7786f72f634 100644 --- a/include/lbann/layers/transform/sum.hpp +++ b/include/lbann/layers/transform/sum.hpp @@ -29,15 +29,30 @@ #include "lbann/layers/transform/transform.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/distconv.hpp" namespace lbann { -template -class sum_layer : public transform_layer { +#ifdef LBANN_HAS_DISTCONV +template +class sum_distconv_adapter: public data_type_distconv_adapter { + public: + using TensorDevType = typename data_type_distconv_adapter::TensorDevType; + sum_distconv_adapter(Layer& layer): data_type_distconv_adapter(layer) {} + virtual ~sum_distconv_adapter() = default; + std::unique_ptr setup_error_signals_i(int index) const override; + void fp_compute(); +}; +#endif // LBANN_HAS_DISTCONV + +template +class sum_layer : public transform_layer { public: sum_layer(lbann_comm *comm) - : transform_layer(comm) { + : transform_layer(comm) { this->m_expected_num_parent_layers = -1; // No limit on parents } @@ -49,29 +64,29 @@ class sum_layer : public transform_layer { protected: void setup_pointers() override { - transform_layer::setup_pointers(); - if (get_num_parents() < 1) { + transform_layer::setup_pointers(); + if (this->get_num_parents() < 1) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has no parent layers"; LBANN_ERROR(err.str()); } } - void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); // Check that input dimensions match - const auto& output_dims = get_output_dims(); - for (int i = 0; i < get_num_parents(); ++i) { - if (get_input_dims(i) != output_dims) { - const auto& parents = get_parent_layers(); + const auto& output_dims = this->get_output_dims(); + for (int i = 0; i < this->get_num_parents(); 
++i) { + if (this->get_input_dims(i) != output_dims) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with incompatible dimensions ("; - for (int j = 0; j < get_num_parents(); ++j) { - const auto& dims = get_input_dims(j); + for (int j = 0; j < this->get_num_parents(); ++j) { + const auto& dims = this->get_input_dims(j); err << (j > 0 ? ", " : "") << "layer \"" << parents[j]->get_name() << "\" outputs "; for (size_t k = 0; k < dims.size(); ++k) { @@ -86,24 +101,83 @@ class sum_layer : public transform_layer { } void fp_compute() override { - auto& output = get_activations(); - El::Copy(get_prev_activations(0), output); - for (int i = 1; i < get_num_parents(); ++i) { - El::Axpy(DataType(1), get_prev_activations(i), output); +#ifdef LBANN_HAS_DISTCONV + if (this->distconv_enabled()) { + get_distconv_adapter().fp_compute(); + return; + } +#endif // LBANN_HAS_DISTCONV + auto& output = this->get_activations(); + El::Copy(this->get_prev_activations(0), output); + for (int i = 1; i < this->get_num_parents(); ++i) { + El::Axpy(DataType(1), this->get_prev_activations(i), output); } } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - const auto& gradient_wrt_output = get_prev_error_signals(); - for (int i = 0; i < get_num_parents(); ++i) { - El::LockedView(get_error_signals(i), gradient_wrt_output); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + for (int i = 0; i < this->get_num_parents(); ++i) { + El::LockedView(this->get_error_signals(i), gradient_wrt_output); } } void bp_compute() override {} +#ifdef LBANN_HAS_DISTCONV + friend class sum_distconv_adapter; + protected: + bool is_distconv_supported() const override { + return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL; + } + void setup_distconv_adapter() override { + this->get_distconv_adapter_ptr() = make_unique>(*this); + } + sum_distconv_adapter& get_distconv_adapter() override; + const sum_distconv_adapter& get_distconv_adapter() const override; +#endif // LBANN_HAS_DISTCONV }; +#ifdef LBANN_HAS_DISTCONV +template +sum_distconv_adapter& +sum_layer::get_distconv_adapter() { + return const_cast&>( + static_cast&>(*this).get_distconv_adapter()); +} + +template +const sum_distconv_adapter& +sum_layer::get_distconv_adapter() const { + return dynamic_cast&>( + data_type_layer::get_distconv_adapter()); +} + +template +std::unique_ptr::TensorDevType> +sum_distconv_adapter::setup_error_signals_i(int index) const { + return make_unique(this->get_prev_error_signals(0)); +} +#endif // LBANN_HAS_DISTCONV + +LBANN_DEFINE_LAYER_BUILDER(sum); + +#ifndef LBANN_SUM_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class sum_layer; \ + extern template class sum_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#ifdef LBANN_HAS_DISTCONV +#define PROTO_DEVICE(T, Device) \ + extern template class sum_distconv_adapter; \ + extern template class sum_distconv_adapter + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_HAS_DISTCONV +#endif // LBANN_SUM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_SUM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/tessellate.hpp b/include/lbann/layers/transform/tessellate.hpp index eafe02cb9df..6c6f92e3ffc 100644 --- a/include/lbann/layers/transform/tessellate.hpp +++ 
b/include/lbann/layers/transform/tessellate.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_TRANSFORM_TESSELLATE_HPP_INCLUDED #define LBANN_LAYERS_TRANSFORM_TESSELLATE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -57,20 +57,34 @@ namespace lbann { * e_n@f$. Then, denoting the modulo operator with @f$ \% @f$, * @f[ Y_{i_1,\cdots,i_n} = X_{i_1\% d_1,\cdots,i_n\% d_n} @f] */ -template -class tessellate_layer : public Layer { +template +class tessellate_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The local tensor type expected in this object. */ + using AbsMatrixType = El::AbstractMatrix; + + ///@} + public: tessellate_layer(lbann_comm *comm, std::vector dims = {}) - : Layer(comm) { - set_output_dims(dims); + : data_type_layer(comm) { + this->set_output_dims(dims); } tessellate_layer(const tessellate_layer& other) - : Layer(other), + : data_type_layer(other), m_input_v(other.m_input_v ? other.m_input_v->Copy() : nullptr) {} tessellate_layer& operator=(const tessellate_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); return *this; } @@ -80,15 +94,15 @@ class tessellate_layer : public Layer { data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } - void setup_dims() override { - Layer::setup_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + data_type_layer::setup_dims(dr_metadata); std::stringstream err; // Check input and output dimensions - const auto input_dims = get_input_dims(); - const auto& output_dims = get_output_dims(); + const auto input_dims = this->get_input_dims(); + const auto& output_dims = this->get_output_dims(); if (input_dims.size() != output_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to tessellate a "; for (size_t i = 0; i < input_dims.size(); ++i) { err << (i > 0 ? "x" : "") << input_dims[i]; @@ -103,7 +117,7 @@ class tessellate_layer : public Layer { /// @todo Support tessellation with >3 dimensions if (input_dims.size() > 3) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to tessellate a "; for (size_t i = 0; i < input_dims.size(); ++i) { err << (i > 0 ? 
"x" : "") << input_dims[i]; @@ -115,10 +129,10 @@ class tessellate_layer : public Layer { } void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist_data = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist_data = this->get_prev_activations().DistData(); dist_data.colDist = El::STAR; - m_input_v.reset(AbsDistMat::Instantiate(dist_data)); + m_input_v.reset(AbsDistMatrixType::Instantiate(dist_data)); } protected: @@ -126,14 +140,14 @@ class tessellate_layer : public Layer { void fp_compute() override { // Get input and output dimensions - auto input_dims = get_input_dims(); - auto output_dims = get_output_dims(); + auto input_dims = this->get_input_dims(); + auto output_dims = this->get_output_dims(); while (input_dims.size() < 3) { input_dims.insert(input_dims.begin(), 1); } while (output_dims.size() < 3) { output_dims.insert(output_dims.begin(), 1); } // Get input and output data - auto& output = get_activations(); - const auto& input = get_prev_activations(); + auto& output = this->get_activations(); + const auto& input = this->get_prev_activations(); m_input_v->Empty(false); m_input_v->AlignWith(output); if (m_input_v->DistData() == input.DistData()) { @@ -155,14 +169,14 @@ class tessellate_layer : public Layer { void bp_compute() override { // Get input and output dimensions - auto input_dims = get_input_dims(); - auto output_dims = get_output_dims(); + auto input_dims = this->get_input_dims(); + auto output_dims = this->get_output_dims(); while (input_dims.size() < 3) { input_dims.insert(input_dims.begin(), 1); } while (output_dims.size() < 3) { output_dims.insert(output_dims.begin(), 1); } // Get input and output data - const auto& gradient_wrt_output = get_prev_error_signals(); - auto& gradient_wrt_input = get_error_signals(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(); m_input_v->Empty(false); m_input_v->AlignWith(gradient_wrt_output); if (m_input_v->DistData() == gradient_wrt_input.DistData()) { @@ -180,7 +194,7 @@ class tessellate_layer : public Layer { // Accumulate local error signals, if needed if (m_input_v->DistData() != gradient_wrt_input.DistData()) { - m_comm->allreduce(*m_input_v, m_input_v->RedundantComm()); + this->m_comm->allreduce(*m_input_v, m_input_v->RedundantComm()); El::Copy(*m_input_v, gradient_wrt_input); } @@ -189,28 +203,37 @@ class tessellate_layer : public Layer { private: /** View into input tensor. */ - std::unique_ptr m_input_v; + std::unique_ptr m_input_v; /** Apply tessellation. * Columns of 'input' should be intact mini-batch samples. If the * data layout is not purely data-parallel, this means input data * is duplicated over the input matrix's column communicator. */ - static void fp_compute_3d(const std::vector& input_dims, - const std::vector& output_dims, - const AbsMat& input, - AbsDistMat& output); + void fp_compute_3d(const std::vector& input_dims, + const std::vector& output_dims, + const AbsMatrixType& input, + AbsDistMatrixType& output); /** Compute local contribution to tessellation back prop * The global gradient w.r.t. input can be obtained by performing * an allreduce over the input matrix's column communicator. 
*/ - static void bp_compute_3d(const std::vector& input_dims, - const std::vector& output_dims, - const AbsDistMat& gradient_wrt_output, - AbsMat& gradient_wrt_input); + void bp_compute_3d(const std::vector& input_dims, + const std::vector& output_dims, + const AbsDistMatrixType& gradient_wrt_output, + AbsMatrixType& gradient_wrt_input); }; +#ifndef LBANN_TESSELLATE_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class tessellate_layer; \ + extern template class tessellate_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_TESSELLATE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_TRANSFORM_TESSELLATE_HPP_INCLUDED diff --git a/include/lbann/layers/transform/transform.hpp b/include/lbann/layers/transform/transform.hpp index 98b2a169ea3..23b579c4322 100644 --- a/include/lbann/layers/transform/transform.hpp +++ b/include/lbann/layers/transform/transform.hpp @@ -27,16 +27,17 @@ #ifndef LBANN_LAYER_TRANSFORM_HPP_INCLUDED #define LBANN_LAYER_TRANSFORM_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { /** @todo Remove. Layers should inherit directly from the base layer * class. */ -class transform_layer : public Layer { +template +class transform_layer : public data_type_layer { public: - transform_layer(lbann_comm *comm) : Layer(comm) {} + transform_layer(lbann_comm *comm) : data_type_layer(comm) {} }; } // namespace lbann diff --git a/include/lbann/layers/transform/uniform.hpp b/include/lbann/layers/transform/uniform.hpp index b10bbb03375..bcb0845138f 100644 --- a/include/lbann/layers/transform/uniform.hpp +++ b/include/lbann/layers/transform/uniform.hpp @@ -28,31 +28,39 @@ #define LBANN_LAYER_UNIFORM_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { -/** @brief Random values with uniform distribution. - * - * During validation and testing, outputs are all equal to the - * distribution mean. - */ -template -class uniform_layer : public transform_layer { +/** @brief Random values from uniform distribution. */ +template +class uniform_layer : public transform_layer { private: - /** Uniform distribution mean. */ - DataType m_min; - /** Uniform distribution standard deviation. */ - DataType m_max; + /** @brief Uniform distribution minimum. */ + TensorDataType m_min; + /** @brief Uniform distribution maximum. */ + TensorDataType m_max; + /** @brief Whether to have deterministic output when not training. + * + * Applies to execution modes other than training, e.g. validation + * and inference. If true, outputs are all equal to the + * distribution mean when not training. 
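Aside, for illustration only (not part of the patch): the new m_training_only flag turns the old hard-coded behavior, where non-training modes always emitted the distribution mean, into an opt-in that defaults to off. The branch reduces to the following sketch, written against a toy fill routine so it stands alone; fill_uniform_toy and the Mode enum are made-up names.

#include <random>
#include <vector>

enum class Mode { training, validation, testing };

// Fill `out` with U(min, max) samples; if training_only is set and we are
// not in training mode, emit the distribution mean instead.
inline void fill_uniform_toy(std::vector<float>& out, float min, float max,
                             Mode mode, bool training_only) {
  const float mean = (max + min) / 2.0f;
  if (training_only && mode != Mode::training) {
    out.assign(out.size(), mean);
    return;
  }
  static std::mt19937 gen{20240101};
  std::uniform_real_distribution<float> dist(min, max);
  for (auto& x : out) { x = dist(gen); }
}

The patch expresses the same branch with El::Fill(output, mean) versus uniform_fill(output, ..., mean, radius), where mean = (max + min) / 2 and radius = (max - min) / 2.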
+ */ + bool m_training_only; public: uniform_layer(lbann_comm *comm, std::vector dims, - DataType min = DataType(0), - DataType max = DataType(1)) - : transform_layer(comm), m_min(min), m_max(max) { - set_output_dims(dims); + TensorDataType min = El::TypeTraits::Zero(), + TensorDataType max = El::TypeTraits::One(), + bool training_only = false) + : transform_layer(comm), + m_min(min), m_max(max), m_training_only(training_only) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } uniform_layer* copy() const override { return new uniform_layer(*this); } @@ -61,28 +69,40 @@ class uniform_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::stringstream ss; ss << "[" << m_min << "," << m_max << ")"; desc.add("Range", ss.str()); + desc.add("Training only", m_training_only); return desc; } protected: void fp_compute() override { - const auto& mean = (m_max + m_min) / 2; - const auto& radius = (m_max - m_min) / 2; - auto& output = get_activations(); - if (this->m_model->get_execution_mode() == execution_mode::training) { - uniform_fill(output, output.Height(), output.Width(), mean, radius); - } else { + const auto& mean = (m_max + m_min) / El::To(2); + const auto& radius = (m_max - m_min) / El::To(2); + auto& output = this->get_activations(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); + if (m_training_only && (mode != execution_mode::training)) { El::Fill(output, mean); } + else { + uniform_fill(output, output.Height(), output.Width(), mean, radius); + } } }; +#ifndef LBANN_UNIFORM_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class uniform_layer; \ + extern template class uniform_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_UNIFORM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_UNIFORM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/unpooling.hpp b/include/lbann/layers/transform/unpooling.hpp index 9a88eabcc1a..c414014f8c0 100644 --- a/include/lbann/layers/transform/unpooling.hpp +++ b/include/lbann/layers/transform/unpooling.hpp @@ -37,24 +37,24 @@ namespace lbann { /** @brief Transpose of pooling layer. * @todo GPU support. */ -template -class unpooling_layer : public transform_layer { +template +class unpooling_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "unpooling only supports DATA_PARALLEL"); + static_assert(Dev == El::Device::CPU, + "unpooling only supports CPU"); private: /** Corresponding pooling layer. 
*/ - pooling_layer* m_pooling_layer; + pooling_layer* m_pooling_layer; public: unpooling_layer(lbann_comm *comm, - pooling_layer* pool = nullptr) - : transform_layer(comm), - m_pooling_layer(pool) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "unpooling only supports DATA_PARALLEL"); - static_assert(Dev == El::Device::CPU, - "unpooling only supports CPU"); - } + pooling_layer* pool = nullptr) + : transform_layer(comm), + m_pooling_layer(pool) { } unpooling_layer* copy() const override { return new unpooling_layer(*this); } std::string get_type() const override { return "unpooling"; } @@ -74,21 +74,21 @@ class unpooling_layer : public transform_layer { } } - void setup_dims() override { - transform_layer::setup_dims(); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); // Check that input tensor is valid - const auto& input_dims = get_input_dims(); + const auto& input_dims = this->get_input_dims(); const auto& pool_output_dims = m_pooling_layer->get_output_dims(); if (input_dims != pool_output_dims) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects input tensors with dimensions "; for (size_t i = 0; i < pool_output_dims.size(); ++i) { err << (i > 0 ? " x " : "") << pool_output_dims[i]; } err << ", but parent layer " - << "\"" << m_parent_layers[0]->get_name() << "\" " + << "\"" << this->get_parent_layers()[0]->get_name() << "\" " << "outputs with dimensions "; for (size_t i = 0; i < input_dims.size(); ++i) { err << (i > 0 ? " x " : "") << input_dims[i]; @@ -97,22 +97,22 @@ class unpooling_layer : public transform_layer { } // Initialize output tensor based on corresponding pooling layer - set_output_dims(m_pooling_layer->get_input_dims()); + this->set_output_dims(m_pooling_layer->get_input_dims()); } - void set_pooling_layer(pooling_layer* pool) { + void set_pooling_layer(pooling_layer* pool) { m_pooling_layer = pool; } std::vector get_layer_pointers() override { - std::vector layers = transform_layer::get_layer_pointers(); + std::vector layers = transform_layer::get_layer_pointers(); layers.push_back((Layer*) m_pooling_layer); return layers; } void set_layer_pointers(std::vector layers) override { - m_pooling_layer = dynamic_cast*>(layers.back()); + m_pooling_layer = dynamic_cast*>(layers.back()); if (m_pooling_layer == nullptr) { std::stringstream err; err << __FILE__ << " " << __LINE__ @@ -120,7 +120,7 @@ class unpooling_layer : public transform_layer { throw lbann_exception(err.str()); } layers.pop_back(); - transform_layer::set_layer_pointers(layers); + transform_layer::set_layer_pointers(layers); } protected: @@ -146,19 +146,21 @@ class unpooling_layer : public transform_layer { /// Unpooling forward propagation with im2col void fp_compute_im2col() { + using DMatDT = El::Matrix; + // Get local matrices - const DMat& prev_activations_local = get_local_prev_activations(); - DMat& activations_local = get_local_activations(); + const DMatDT& prev_activations_local = this->get_local_prev_activations(); + DMatDT& activations_local = this->get_local_activations(); // Get parameters const int local_width = prev_activations_local.Width(); - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_input_channel = get_input_size() / num_channels; + const int num_per_input_channel = this->get_input_size() / num_channels; 
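// Aside, for illustration only (not part of the patch): the unpooling
// forward pass hinges on m_max_pool_indices, which records, for each pooled
// value, the offset inside its pooling window where the max was taken. The
// standalone sketch below shows that scatter step for one sample and one
// channel on raw float buffers; scatter_unpool is a made-up name.
//
// pooled[j]  : the j-th pooled value (one per window)
// indices[j] : offset of the max within window j, in [0, pool_size)
// windows    : pre-zeroed buffer of num_windows * pool_size entries; window
//              j occupies windows[j*pool_size .. j*pool_size + pool_size)
inline void scatter_unpool(const float* pooled, const int* indices,
                           float* windows, int num_windows, int pool_size) {
  for (int j = 0; j < num_windows; ++j) {
    windows[j * pool_size + indices[j]] = pooled[j];  // other entries stay 0
  }
}
// The patch does the same thing into an im2col matrix (one window per
// column block, pre-zeroed with El::Zero) and then folds the windows back
// onto the output tensor with col2im, using a max reduction where windows
// overlap.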
const int pool_size = m_pooling_layer->m_pool_size; // Initialize im2col matrix - DMat im2col_mat(pool_size * num_channels, num_per_input_channel); + DMatDT im2col_mat(pool_size * num_channels, num_per_input_channel); // Iterate through data samples for(int sample = 0; sample < local_width; ++sample) { @@ -167,16 +169,16 @@ class unpooling_layer : public transform_layer { El::Zero(im2col_mat); // Populate im2col matrix - const DataType *prev_activations_buffer + const TensorDataType *prev_activations_buffer = prev_activations_local.LockedBuffer(0, sample); const int *indices_buffer - = &m_pooling_layer->m_max_pool_indices[sample * get_input_size()]; + = &m_pooling_layer->m_max_pool_indices[sample * this->get_input_size()]; LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_input_channel; ++j) { const int input_index = j + channel * num_per_input_channel; const int max_index = indices_buffer[input_index]; - DataType *im2col_buffer + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel * pool_size, j); im2col_buffer[max_index] = prev_activations_buffer[input_index]; @@ -184,8 +186,9 @@ class unpooling_layer : public transform_layer { } // Convert im2col matrix to output matrix - DMat output_mat = El::View(activations_local, El::ALL, El::IR(sample)); - col2im(im2col_mat, + DMatDT output_mat = + El::View(activations_local, El::ALL, El::IR(sample)); + col2im(im2col_mat, output_mat, num_channels, output_dims.size() - 1, @@ -193,36 +196,38 @@ class unpooling_layer : public transform_layer { m_pooling_layer->m_pads.data(), m_pooling_layer->m_pool_dims.data(), m_pooling_layer->m_strides.data(), - static_cast(&std::max)); - + [](TensorDataType const& a, TensorDataType const& b) { + return std::max(a, b); + }); } - } /// Unpooling backward propagation with im2col void bp_compute_im2col() { + using DMatDT = El::Matrix; + // Get local matrices - const DMat& prev_error_signal_local = get_local_prev_error_signals(); - DMat& error_signal_local = get_local_error_signals(); + const DMatDT& prev_error_signal_local = this->get_local_prev_error_signals(); + DMatDT& error_signal_local = this->get_local_error_signals(); // Get parameters const int local_width = prev_error_signal_local.Width(); - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_output_channel = get_input_size() / num_channels; + const int num_per_output_channel = this->get_input_size() / num_channels; const int pool_size = m_pooling_layer->m_pool_size; // Initialize im2col matrix - DMat im2col_mat(pool_size * num_channels, num_per_output_channel); + DMatDT im2col_mat(pool_size * num_channels, num_per_output_channel); // Iterate through data samples for(int sample = 0; sample < local_width; ++sample) { // Construct im2col matrix from input - const DMat& input_mat = El::LockedView(prev_error_signal_local, - El::ALL, El::IR(sample)); - im2col(input_mat, + const DMatDT& input_mat = El::LockedView(prev_error_signal_local, + El::ALL, El::IR(sample)); + im2col(input_mat, im2col_mat, num_channels, output_dims.size() - 1, @@ -232,15 +237,15 @@ class unpooling_layer : public transform_layer { m_pooling_layer->m_strides.data()); // Propagate error signal based on pooling layer - DataType *output_buffer = error_signal_local.Buffer(0, sample); + TensorDataType *output_buffer = error_signal_local.Buffer(0, sample); const int *indices_buffer - = &m_pooling_layer->m_max_pool_indices[sample * 
get_input_size()]; + = &m_pooling_layer->m_max_pool_indices[sample * this->get_input_size()]; LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_output_channel; ++j) { const int output_index = j + channel * num_per_output_channel; const int max_index = indices_buffer[output_index]; - DataType *im2col_buffer + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel * pool_size, j); output_buffer[output_index] = im2col_buffer[max_index]; } @@ -252,6 +257,16 @@ class unpooling_layer : public transform_layer { }; +#ifndef LBANN_UNPOOLING_LAYER_INSTANTIATE +#define PROTO(T) \ + extern template class unpooling_layer + +#define LBANN_INSTANTIATE_CPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#endif // LBANN_UNPOOLING_LAYER_INSTANTIATE + } // namespace lbann -#endif // LBANN_LAYER_POOLING_HPP_INCLUDED +#endif // LBANN_LAYER_UNPOOLING_HPP_INCLUDED diff --git a/include/lbann/layers/transform/weighted_sum.hpp b/include/lbann/layers/transform/weighted_sum.hpp index 5f77caeaa9a..3bc8575b6a7 100644 --- a/include/lbann/layers/transform/weighted_sum.hpp +++ b/include/lbann/layers/transform/weighted_sum.hpp @@ -34,8 +34,10 @@ namespace lbann { /** @brief Add tensors with specified scaling factors. */ -template -class weighted_sum_layer : public transform_layer { +template +class weighted_sum_layer : public transform_layer { private: /** Scaling factors for weighted sum. */ @@ -44,7 +46,7 @@ class weighted_sum_layer : public transform_layer { public: weighted_sum_layer(lbann_comm *comm, std::vector scaling_factors) - : transform_layer(comm), + : transform_layer(comm), m_scaling_factors(scaling_factors) { this->m_expected_num_parent_layers = -1; // No limit on parents } @@ -55,7 +57,7 @@ class weighted_sum_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::stringstream ss; for (size_t i = 0; i < m_scaling_factors.size(); ++i) { ss << (i > 0 ? 
", " : "") << m_scaling_factors[i]; @@ -67,36 +69,36 @@ class weighted_sum_layer : public transform_layer { protected: void setup_pointers() override { - transform_layer::setup_pointers(); + transform_layer::setup_pointers(); std::stringstream err; - if (get_num_parents() < 1) { - err << get_type() << " layer \"" << get_name() << "\" " + if (this->get_num_parents() < 1) { + err << get_type() << " layer \"" << this->get_name() << "\" " << "has no parent layers"; LBANN_ERROR(err.str()); } - if ((int) m_scaling_factors.size() != get_num_parents()) { - err << get_type() << " layer \"" << get_name() << "\" " + if ((int) m_scaling_factors.size() != this->get_num_parents()) { + err << get_type() << " layer \"" << this->get_name() << "\" " << "has an invalid number of scaling factors " << "(found " << m_scaling_factors.size() << ", " - << "but there are " << get_num_parents() << " parent layers)"; + << "but there are " << this->get_num_parents() << " parent layers)"; LBANN_ERROR(err.str()); } } - void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + void setup_dims(DataReaderMetaData& dr_metadata) override { + transform_layer::setup_dims(dr_metadata); + this->set_output_dims(this->get_input_dims()); // Check that input dimensions match - const auto& output_dims = get_output_dims(); - for (int i = 0; i < get_num_parents(); ++i) { - if (get_input_dims(i) != output_dims) { - const auto& parents = get_parent_layers(); + const auto& output_dims = this->get_output_dims(); + for (int i = 0; i < this->get_num_parents(); ++i) { + if (this->get_input_dims(i) != output_dims) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with incompatible dimensions ("; - for (int j = 0; j < get_num_parents(); ++j) { - const auto& dims = get_input_dims(j); + for (int j = 0; j < this->get_num_parents(); ++j) { + const auto& dims = this->get_input_dims(j); err << (j > 0 ? 
", " : "") << "layer \"" << parents[j]->get_name() << "\" outputs "; for (size_t k = 0; k < dims.size(); ++k) { @@ -111,17 +113,17 @@ class weighted_sum_layer : public transform_layer { } void fp_compute() override { - auto& output = get_activations(); + auto& output = this->get_activations(); El::Zero(output); - for (int i = 0; i < get_num_parents(); ++i) { - El::Axpy(m_scaling_factors[i], get_prev_activations(i), output); + for (int i = 0; i < this->get_num_parents(); ++i) { + El::Axpy(m_scaling_factors[i], this->get_prev_activations(i), output); } } void bp_compute() override { - const auto& gradient_wrt_output = get_prev_error_signals(); - for (int i = 0; i < get_num_parents(); ++i) { - auto& gradient_wrt_input = get_error_signals(i); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + for (int i = 0; i < this->get_num_parents(); ++i) { + auto& gradient_wrt_input = this->get_error_signals(i); El::Zero(gradient_wrt_input); El::Axpy(m_scaling_factors[i], gradient_wrt_output, gradient_wrt_input); @@ -130,6 +132,17 @@ class weighted_sum_layer : public transform_layer { }; +LBANN_DEFINE_LAYER_BUILDER(weighted_sum); + +#ifndef LBANN_WEIGHTED_SUM_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class weighted_sum_layer; \ + extern template class weighted_sum_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_WEIGHTED_SUM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_WEIGHTED_SUM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/weights.hpp b/include/lbann/layers/transform/weights.hpp index f6d74931347..fd48affe0c6 100644 --- a/include/lbann/layers/transform/weights.hpp +++ b/include/lbann/layers/transform/weights.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_WEIGHTS_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" namespace lbann { @@ -35,46 +36,55 @@ namespace lbann { * * Interfaces with a @c weights object and outputs its tensor. */ -template -class weights_layer : public transform_layer { +template +class weights_layer : public transform_layer { + +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The local tensor type expected in this object. */ + using AbsMatrixType = El::AbstractMatrix; + + /** @brief The device-specific local tensor type. */ + using CPUMatType = El::Matrix; + +#ifdef LBANN_HAS_GPU + /** @brief The GPU device-specific local tensor type. */ + using GPUMatType = El::Matrix; +#endif + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} public: weights_layer(lbann_comm *comm, std::vector dims) - : transform_layer(comm) { + : transform_layer(comm) { std::vector dims_; for (const auto& d : dims) { dims_.push_back(d); } - set_output_dims(dims_); + this->set_output_dims(dims_); this->m_expected_num_parent_layers = 0; } weights_layer(const weights_layer& other) - : transform_layer(other), + : transform_layer(other), m_gradient(other.m_gradient ? 
other.m_gradient->Copy() : nullptr) { - if (other.m_workspace) { - switch (other.m_workspace->GetDevice()) { - case El::Device::CPU: m_workspace.reset(new CPUMat()); break; -#ifdef LBANN_HAS_GPU - case El::Device::GPU: m_workspace.reset(new GPUMat()); break; -#endif // LBANN_HAS_GPU - default: LBANN_ERROR("unknown device type"); - } - m_workspace->SetMemoryMode(other.m_workspace->MemoryMode()); - } + m_workspace.SetMemoryMode(other.m_workspace.MemoryMode()); } weights_layer& operator=(const weights_layer& other){ - transform_layer::operator=(other); + transform_layer::operator=(other); m_gradient.reset(other.m_gradient ? other.m_gradient->Copy() : nullptr); - m_workspace.reset(); - if (other.m_workspace) { - switch (other.m_workspace->GetDevice()) { - case El::Device::CPU: m_workspace.reset(new CPUMat()); break; -#ifdef LBANN_HAS_GPU - case El::Device::GPU: m_workspace.reset(new GPUMat()); break; -#endif // LBANN_HAS_GPU - default: LBANN_ERROR("unknown device type"); - } - m_workspace->SetMemoryMode(other.m_workspace->MemoryMode()); - } + m_workspace.SetMemoryMode(other.m_workspace.MemoryMode()); return *this; } weights_layer* copy() const override { return new weights_layer(*this); } @@ -85,70 +95,56 @@ class weights_layer : public transform_layer { protected: void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); + transform_layer::setup_matrices(grid); // Initialize weights gradient - auto dist = get_activations().DistData(); + auto dist = this->get_activations().DistData(); dist.rowDist = El::STAR; - m_gradient.reset(AbsDistMat::Instantiate(dist)); + m_gradient.reset(AbsDistMatrixType::Instantiate(dist)); // Initialize workspace - switch (Dev) { - case El::Device::CPU: m_workspace.reset(new CPUMat()); break; -#ifdef LBANN_HAS_GPU - case El::Device::GPU: - m_workspace.reset(new GPUMat()); -#ifdef HYDROGEN_HAVE_CUB - m_workspace->SetMemoryMode(1); // Use CUB GPU memory pool if possible -#endif // HYDROGEN_HAVE_CUB - break; -#endif // LBANN_HAS_GPU - default: LBANN_ERROR("unknown device type"); - } - +#if defined HYDROGEN_HAVE_CUB + if (Dev == El::Device::GPU) + m_workspace.SetMemoryMode(1); // Use CUB GPU memory pool if possible +#endif // defined HYDROGEN_HAVE_CUB } - void setup_data() override { - transform_layer::setup_data(); + void setup_data(size_t max_mini_batch_size) override { + transform_layer::setup_data(max_mini_batch_size); // Initialize default weights if none are provided - if (this->m_weights.size() > 1) { - std::stringstream err; - err << "attempted to setup " - << get_type() << " layer \"" << get_name() << "\" " - << "with an invalid number of weights " - << "(expected at most 1, " - << "but found " << this->m_weights.size() << ")"; - LBANN_ERROR(err.str()); + if (!this->has_weights()) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(DataType(0)); + auto opt = this->m_model->template create_optimizer(); + w->set_name(this->get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->add_weights(w.get()); + this->m_model->add_weights(std::move(w)); } - this->m_weights.resize(1, nullptr); - auto& w = this->m_weights[0]; - if (w == nullptr) { - w = new weights(get_comm()); - std::unique_ptr init(new constant_initializer(DataType(0))); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_weights"); - w->set_initializer(init); - w->set_optimizer(opt); - this->m_model->add_weights(w); + if (this->num_weights() != 1) { + 
LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected at most 1, ", + "but found ",this->num_weights(),")"); } // Setup weights and weights gradient - m_gradient->AlignWith(get_activations()); - m_gradient->Resize(get_output_size(), 1); - w->set_dims(get_output_dims()); - w->set_matrix_distribution(m_gradient->DistData()); + m_gradient->AlignWith(this->get_activations()); + m_gradient->Resize(this->get_output_size(), 1); + this->get_weights(0).set_dims(this->get_output_dims()); + this->get_weights(0).set_matrix_distribution(m_gradient->DistData()); // Initialize freeze state - if (this->m_frozen) { w->freeze(); } - else { w->unfreeze(); } - if (w->is_frozen() != this->m_frozen) { - std::stringstream err; - err << (m_frozen ? "" : "un") << "frozen " - << "layer \"" << get_name() << "\" has " - << (w->is_frozen() ? "" : "un") << "frozen " - << "weights \"" << w->get_name() << "\""; - LBANN_ERROR(err.str()); + if (this->m_frozen) { this->get_weights(0).freeze(); } + else { this->get_weights(0).unfreeze(); } + if (this->get_weights(0).is_frozen() != this->m_frozen) { + LBANN_ERROR((this->m_frozen ? "" : "un"),"frozen ", + "layer \"",this->get_name(),"\" has ", + (this->get_weights(0).is_frozen() ? "" : "un"),"frozen ", + "weights \"",this->get_weights(0).get_name(),"\""); } } @@ -156,56 +152,65 @@ class weights_layer : public transform_layer { void fp_compute() override { // Matrices - const auto& local_weights = m_weights[0]->get_values().LockedMatrix(); - auto& local_output = get_local_activations(); - m_workspace->Resize(local_output.Width(), 1); - El::Fill(*m_workspace, DataType(1)); + const auto& local_weights = this->weights_values(0).LockedMatrix(); + auto& local_output = this->get_local_activations(); + El::Ones(m_workspace, local_output.Width(), 1); // Duplicate weights across matrix columns El::Gemm(El::NORMAL, El::TRANSPOSE, - DataType(1), local_weights, *m_workspace, - DataType(0), local_output); + El::TypeTraits::One(), local_weights, m_workspace, + El::TypeTraits::Zero(), local_output); // Clean up - m_workspace->Empty(); + m_workspace.Empty(); } void bp_compute() override { - constexpr DataType zero = 0; - constexpr DataType one = 1; // Get optimizer // Note: Nothing needs to be done if there is no optimizer - auto* opt = this->m_weights[0]->get_optimizer(); + auto* opt = this->get_weights(0).get_optimizer(); if (opt == nullptr) { return; } // Matrices - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - m_workspace->Resize(local_gradient_wrt_output.Width(), 1); - El::Fill(*m_workspace, one); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + El::Ones(m_workspace, local_gradient_wrt_output.Width(), 1); - // Compute gradient contribution and accumulate - const auto& scale = one / this->m_model->get_effective_mini_batch_size(); El::Gemv(El::NORMAL, - scale, local_gradient_wrt_output, *m_workspace, - zero, m_gradient->Matrix()); - opt->add_to_gradient(*m_gradient, one, true); + El::TypeTraits::One(), + local_gradient_wrt_output, m_workspace, + El::TypeTraits::Zero(), + m_gradient->Matrix()); + + opt->add_to_gradient(*m_gradient, + El::TypeTraits::One(), + true); // Clean up - m_workspace->Empty(); + m_workspace.Empty(); } private: /** Weights gradient. */ - std::unique_ptr m_gradient; + std::unique_ptr m_gradient; /** Workspace. 
*/ - std::unique_ptr m_workspace; - + El::Matrix m_workspace; }; +LBANN_DEFINE_LAYER_BUILDER(weights); + +#ifndef LBANN_WEIGHTS_LAYER_INSTANTIATE +#define PROTO_DEVICE(T, Device) \ + extern template class weights_layer; \ + extern template class weights_layer + +#include "lbann/macros/instantiate_device.hpp" +#undef PROTO_DEVICE +#endif // LBANN_WEIGHTS_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 5b1ba94d470..a9dbdf2c553 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -27,6 +27,12 @@ #ifndef LBANN_LBANN_HPP_INCLUDED #define LBANN_LBANN_HPP_INCLUDED +/// Trainers +#include "lbann/trainers/trainer.hpp" + +/// Training Algorithms +#include "lbann/training_algorithms/training_algorithm.hpp" + /// Models #include "lbann/models/directed_acyclic_graph.hpp" @@ -45,6 +51,9 @@ #include "lbann/layers/learning/fully_connected.hpp" #include "lbann/layers/learning/convolution.hpp" #include "lbann/layers/learning/deconvolution.hpp" +#include "lbann/layers/learning/embedding.hpp" +#include "lbann/layers/learning/channelwise_scale_bias.hpp" +#include "lbann/layers/learning/entrywise_scale_bias.hpp" /// Loss layers #include "lbann/layers/loss/categorical_accuracy.hpp" @@ -69,7 +78,7 @@ #include "lbann/layers/transform/sum.hpp" #include "lbann/layers/transform/weighted_sum.hpp" #include "lbann/layers/transform/slice.hpp" -#include "lbann/layers/transform/concatenation.hpp" +#include "lbann/layers/transform/concatenate.hpp" #include "lbann/layers/transform/constant.hpp" #include "lbann/layers/transform/dummy.hpp" #include "lbann/layers/transform/hadamard.hpp" @@ -92,6 +101,9 @@ #include "lbann/layers/regularizers/dropout.hpp" #include "lbann/layers/regularizers/selu_dropout.hpp" #include "lbann/layers/regularizers/batch_normalization.hpp" +#include "lbann/layers/regularizers/entrywise_batch_normalization.hpp" +#include "lbann/layers/regularizers/layer_norm.hpp" +#include "lbann/layers/regularizers/instance_norm.hpp" /// Input layer #include "lbann/layers/io/input/input_layer.hpp" @@ -100,19 +112,19 @@ #include "lbann/layers/misc/covariance.hpp" #include "lbann/layers/misc/variance.hpp" #include "lbann/layers/misc/channelwise_mean.hpp" +#include "lbann/layers/misc/channelwise_softmax.hpp" #include "lbann/layers/misc/mini_batch_index.hpp" #include "lbann/layers/misc/mini_batch_size.hpp" +#include "lbann/layers/misc/argmax.hpp" +#include "lbann/layers/misc/argmin.hpp" +#include "lbann/layers/misc/one_hot.hpp" /// Data readers +#include "lbann/data_readers/data_reader_npz_ras_lipid.hpp" #include "lbann/data_readers/data_reader_imagenet.hpp" -#include "lbann/data_readers/data_reader_imagenet_patches.hpp" #include "lbann/data_readers/data_reader_cifar10.hpp" #include "lbann/data_readers/data_reader_mnist.hpp" -#include "lbann/data_readers/data_reader_multi_images.hpp" -#include "lbann/data_readers/data_reader_mnist_siamese.hpp" -#include "lbann/data_readers/data_reader_multihead_siamese.hpp" #include "lbann/data_readers/data_reader_synthetic.hpp" -#include "lbann/data_readers/data_reader_jag.hpp" #include "lbann/data_readers/data_reader_jag_conduit.hpp" #include "lbann/data_readers/data_reader_nci.hpp" #include "lbann/data_readers/data_reader_numpy.hpp" @@ -121,51 +133,53 @@ #include "lbann/data_readers/data_reader_csv.hpp" #include "lbann/data_readers/data_reader_merge_samples.hpp" #include "lbann/data_readers/data_reader_merge_features.hpp" -#include 
"lbann/data_readers/data_reader_ascii.hpp" #include "lbann/data_readers/data_reader_pilot2_molecular.hpp" #include "lbann/data_readers/data_reader_mesh.hpp" -#include "lbann/data_readers/data_reader_moving_mnist.hpp" #include "lbann/data_readers/data_reader_python.hpp" +#include "lbann/data_readers/data_reader_smiles.hpp" /// Data stores #include "lbann/data_store/data_store_conduit.hpp" /// Callbacks -#include "lbann/callbacks/callback_check_init.hpp" -#include "lbann/callbacks/callback_checknan.hpp" -#include "lbann/callbacks/callback_checksmall.hpp" -#include "lbann/callbacks/callback_check_dataset.hpp" -#include "lbann/callbacks/callback_print.hpp" -#include "lbann/callbacks/callback_timer.hpp" -#include "lbann/callbacks/callback_io.hpp" -#include "lbann/callbacks/callback_summary.hpp" -#include "lbann/callbacks/callback_learning_rate.hpp" -#include "lbann/callbacks/callback_debug.hpp" -#include "lbann/callbacks/callback_debug_io.hpp" -#include "lbann/callbacks/callback_imcomm.hpp" -#include "lbann/callbacks/callback_dump_weights.hpp" -#include "lbann/callbacks/callback_dump_outputs.hpp" -#include "lbann/callbacks/callback_dump_error_signals.hpp" -#include "lbann/callbacks/callback_dump_gradients.hpp" -#include "lbann/callbacks/callback_dump_minibatch_sample_indices.hpp" -#include "lbann/callbacks/callback_early_stopping.hpp" -#include "lbann/callbacks/callback_ltfb.hpp" -#include "lbann/callbacks/callback_save_images.hpp" -#include "lbann/callbacks/callback_save_model.hpp" +#include "lbann/callbacks/check_dataset.hpp" +#include "lbann/callbacks/check_gradients.hpp" +#include "lbann/callbacks/check_init.hpp" +#include "lbann/callbacks/check_metric.hpp" +#include "lbann/callbacks/check_nan.hpp" +#include "lbann/callbacks/check_small.hpp" +#include "lbann/callbacks/checkpoint.hpp" +#include "lbann/callbacks/confusion_matrix.hpp" +#include "lbann/callbacks/debug.hpp" +#include "lbann/callbacks/debug_io.hpp" +#include "lbann/callbacks/dump_error_signals.hpp" +#include "lbann/callbacks/dump_gradients.hpp" +#include "lbann/callbacks/dump_minibatch_sample_indices.hpp" +#include "lbann/callbacks/dump_outputs.hpp" +#include "lbann/callbacks/dump_weights.hpp" +#include "lbann/callbacks/early_stopping.hpp" +#include "lbann/callbacks/gpu_memory_usage.hpp" +#include "lbann/callbacks/hang.hpp" +#include "lbann/callbacks/imcomm.hpp" +#include "lbann/callbacks/learning_rate.hpp" +#include "lbann/callbacks/ltfb.hpp" +#include "lbann/callbacks/mixup.hpp" +#include "lbann/callbacks/monitor_io.hpp" +#include "lbann/callbacks/perturb_adam.hpp" +#include "lbann/callbacks/perturb_dropout.hpp" +#include "lbann/callbacks/print_model_description.hpp" +#include "lbann/callbacks/print_statistics.hpp" #include "lbann/callbacks/profiler.hpp" -#include "lbann/callbacks/callback_hang.hpp" -#include "lbann/callbacks/callback_variable_minibatch.hpp" -#include "lbann/callbacks/callback_timeline.hpp" -#include "lbann/callbacks/callback_checkpoint.hpp" -#include "lbann/callbacks/callback_save_model.hpp" -#include "lbann/callbacks/callback_replace_weights.hpp" -#include "lbann/callbacks/callback_gpu_memory_usage.hpp" -#include "lbann/callbacks/callback_sync_layers.hpp" -#include "lbann/callbacks/callback_sync_selected.hpp" -#include "lbann/callbacks/callback_confusion_matrix.hpp" -#include "lbann/callbacks/callback_check_gradients.hpp" -#include "lbann/callbacks/callback_check_metric.hpp" -#include "lbann/callbacks/callback_perturb_adam.hpp" +#include "lbann/callbacks/replace_weights.hpp" +#include 
"lbann/callbacks/save_images.hpp" +#include "lbann/callbacks/save_model.hpp" +#include "lbann/callbacks/load_model.hpp" +#include "lbann/callbacks/save_topk_models.hpp" +#include "lbann/callbacks/summary.hpp" +#include "lbann/callbacks/sync_layers.hpp" +#include "lbann/callbacks/timeline.hpp" +#include "lbann/callbacks/timer.hpp" +#include "lbann/callbacks/variable_minibatch.hpp" /// Weights and weight initializers #include "lbann/weights/weights.hpp" @@ -200,7 +214,5 @@ #include "lbann/utils/peek_map.hpp" #include "lbann/utils/stack_trace.hpp" #include "lbann/utils/stack_profiler.hpp" -#include "lbann/utils/threads/thread_pool.hpp" -#include "lbann/utils/threads/thread_utils.hpp" #endif // LBANN_LBANN_HPP_INCLUDED diff --git a/include/lbann/macros/CMakeLists.txt b/include/lbann/macros/CMakeLists.txt new file mode 100644 index 00000000000..45b42256ed3 --- /dev/null +++ b/include/lbann/macros/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + instantiate.hpp + instantiate_device.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/macros/instantiate.hpp b/include/lbann/macros/instantiate.hpp new file mode 100644 index 00000000000..51e3f038476 --- /dev/null +++ b/include/lbann/macros/instantiate.hpp @@ -0,0 +1,40 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +PROTO(float); +PROTO(double); + +#ifdef LBANN_HAS_HALF +#ifdef LBANN_INSTANTIATE_CPU_HALF +PROTO(cpu_fp16); +#endif // LBANN_INSTANTIATE_CPU_HALF +#endif // LBANN_HAS_HALF + +#ifdef LBANN_HAS_GPU_FP16 +#ifdef LBANN_INSTANTIATE_GPU_HALF +PROTO(fp16); +#endif // LBANN_INSTANTIATE_GPU_HALF +#endif // LBANN_HAS_GPU_FP16 diff --git a/include/lbann/macros/instantiate_device.hpp b/include/lbann/macros/instantiate_device.hpp new file mode 100644 index 00000000000..9b3d8dcd5ee --- /dev/null +++ b/include/lbann/macros/instantiate_device.hpp @@ -0,0 +1,43 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_INSTANTIATE_CPU_HALF +#define PROTO(T) \ + PROTO_DEVICE(T, El::Device::CPU) + +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF + +#ifdef LBANN_HAS_GPU +#define LBANN_INSTANTIATE_GPU_HALF +#define PROTO(T) \ + PROTO_DEVICE(T, El::Device::GPU) + +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_GPU_HALF +#endif // LBANN_HAS_GPU diff --git a/include/lbann/metrics/layer_metric.hpp b/include/lbann/metrics/layer_metric.hpp index f0f9c811504..d7700b735c9 100644 --- a/include/lbann/metrics/layer_metric.hpp +++ b/include/lbann/metrics/layer_metric.hpp @@ -47,6 +47,11 @@ class layer_metric : public metric { std::string name() const override; std::string get_unit() const override { return m_unit; } + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(cereal::base_class(this), CEREAL_NVP(m_name), CEREAL_NVP(m_unit)); + } + /** Set corresponding layer. */ void set_layer(Layer& l); /** Get corresponding layer. */ @@ -59,6 +64,14 @@ class layer_metric : public metric { /** Set list of pointers to layers. */ void set_layer_pointers(std::vector layers) override; + /** Save metric state to checkpoint. */ + bool save_to_checkpoint_shared(persist& p); + /** Load metric state from checkpoint. */ + bool load_from_checkpoint_shared(persist& p); + + bool save_to_checkpoint_distributed(persist& p); + bool load_from_checkpoint_distributed(persist& p); + protected: void setup(model& m) override; @@ -86,7 +99,7 @@ class layer_metric : public metric { Layer* m_layer; /** Get corresponding evaluation layer. */ - abstract_evaluation_layer& get_evaluation_layer(); + /*abstract_evaluation_*/Layer& get_evaluation_layer(); }; diff --git a/include/lbann/metrics/metric.hpp b/include/lbann/metrics/metric.hpp index d270c361bb5..4a8be6c5e98 100644 --- a/include/lbann/metrics/metric.hpp +++ b/include/lbann/metrics/metric.hpp @@ -31,6 +31,8 @@ #include "lbann/comm.hpp" #include "lbann/utils/exception.hpp" #include "lbann/io/persist.hpp" +#include +#include namespace lbann { @@ -56,6 +58,13 @@ struct metric_statistics { metric_statistics& operator=(const metric_statistics& other) = default; /** Destructor. */ ~metric_statistics() = default; + + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_sum), + CEREAL_NVP(m_num_samples)); + } + /** Add metric value to statistics. */ void add_value(EvalType value, int num_samples = 1); /** Get mean metric value. @@ -67,19 +76,6 @@ struct metric_statistics { int get_num_samples() const { return m_num_samples; } /** Reset statistics. 
*/ void reset(); - - //************************************************************************ - // Checkpointing - //************************************************************************ - /** struct used to serialize mode fields in file and MPI transfer */ - struct packing_header { - double sum; - uint64_t num_samples; - }; - bool pack_scalars(persist& p); - bool unpack_scalars(persist& p, struct packing_header *header); - void unpack_header(struct packing_header& header); - }; /** Abstract base class for metric functions. @@ -102,6 +98,11 @@ class metric { /** Copy function. */ virtual metric* copy() const = 0; + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_statistics)); + } + /** Return a string name for this metric. */ virtual std::string name() const = 0; /** Return a display unit for this metric. @@ -122,9 +123,13 @@ class metric { virtual EvalType evaluate(execution_mode mode, int mini_batch_size) = 0; /** Clear all statistics. */ - void reset_statistics() { m_statistics.clear(); } + void reset_statistics() { + for (auto& stats : m_statistics) { + stats.second.reset(); + } + } /** Clear statistics for an execution mode. */ - void reset_statistics(execution_mode mode) { m_statistics.erase(mode); } + void reset_statistics(execution_mode mode) { m_statistics[mode].reset(); } /** Get mean metric value. * If mini-batch sizes are not identical, the mean is over the @@ -150,12 +155,12 @@ class metric { } /** Save metric state to checkpoint. */ - virtual bool save_to_checkpoint_shared(persist& p); + virtual bool save_to_checkpoint_shared(persist& p) = 0; /** Load metric state from checkpoint. */ - virtual bool load_from_checkpoint_shared(persist& p); + virtual bool load_from_checkpoint_shared(persist& p) = 0; - virtual bool save_to_checkpoint_distributed(persist& p); - virtual bool load_from_checkpoint_distributed(persist& p); + virtual bool save_to_checkpoint_distributed(persist& p) = 0; + virtual bool load_from_checkpoint_distributed(persist& p) = 0; protected: diff --git a/include/lbann/models/directed_acyclic_graph.hpp b/include/lbann/models/directed_acyclic_graph.hpp index a47c6a8f123..8949bdf248d 100644 --- a/include/lbann/models/directed_acyclic_graph.hpp +++ b/include/lbann/models/directed_acyclic_graph.hpp @@ -30,6 +30,8 @@ #include "lbann/models/model.hpp" #include "lbann/layers/layer.hpp" +#include + namespace lbann { /** Neural network model with a DAG layer graph. 
*/ @@ -37,13 +39,12 @@ class directed_acyclic_graph_model : public model { public: directed_acyclic_graph_model(lbann_comm *comm, - El::Int max_mini_batch_size, - objective_function *obj_fn, - optimizer *default_optimizer); + std::unique_ptr obj_fn, + std::unique_ptr default_optimizer_msg); directed_acyclic_graph_model(const directed_acyclic_graph_model& other) = default; directed_acyclic_graph_model& operator=(const directed_acyclic_graph_model& other) = default; ~directed_acyclic_graph_model() override = default; - directed_acyclic_graph_model* copy() const override { return new directed_acyclic_graph_model(*this); } + std::unique_ptr copy_model() const override { return make_unique(*this); } std::string get_type() const override { return "directed acyclic graph"; } protected: diff --git a/include/lbann/models/model.hpp b/include/lbann/models/model.hpp index 7e8671a5289..de30914d0c8 100644 --- a/include/lbann/models/model.hpp +++ b/include/lbann/models/model.hpp @@ -30,24 +30,42 @@ #include "lbann/base.hpp" #include "lbann/comm.hpp" #include "lbann/layers/layer.hpp" +#include "lbann/data_coordinator/data_coordinator_metadata.hpp" +#include "lbann/execution_contexts/execution_context.hpp" #include "lbann/utils/summary.hpp" #include "lbann/utils/graph.hpp" #include "lbann/io/file_io.hpp" #include "lbann/io/persist.hpp" -#include "lbann/objective_functions/objective_function.hpp" #include "lbann/metrics/metric.hpp" -#include "lbann/weights/weights.hpp" +#include "lbann/objective_functions/objective_function.hpp" #include "lbann/optimizers/optimizer.hpp" +#include "lbann/proto/factories.hpp" +#include "lbann/weights/weights.hpp" #include "lbann/utils/threads/thread_pool.hpp" -#include +#include + +// Note (trb): There's what is, IMO, an STL error in GCC in which the +// dtor for unique_ptr is checking sizeof(T), so this must be a +// complete type. Sigh. (The greater implication of this is that you +// cannot have `unique_ptr` as a drop-in for +// `IncompleteType*`, which is annoying. +#include + #include #include #include +// Forward-declare protobuf class +namespace lbann_data { +class Model; +} + namespace lbann { // Forward declarations class lbann_callback; +class training_algorithm; +class callback_base; /** @brief Abstract base class for neural network models. */ class model { @@ -58,13 +76,17 @@ class model { // =========================================== model(lbann_comm* comm, - El::Int mini_batch_size, - objective_function* obj_fn, - optimizer* default_optimizer = nullptr); + std::unique_ptr obj_fn, + std::unique_ptr default_optimizer_msg = nullptr); model(const model& other); model& operator=(const model& other); virtual ~model(); - virtual model* copy() const = 0; + virtual std::unique_ptr copy_model() const = 0; + + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(CEREAL_NVP(*m_objective_function)); + } // =========================================== // Access functions @@ -91,8 +113,8 @@ class model { virtual description get_description() const; /** @brief Mathematical function to be minimized during training. */ - objective_function* get_objective_function() const { - return m_objective_function; + observer_ptr get_objective_function() const { + return m_objective_function.get(); } /** @brief Return the model's metrics. */ @@ -120,63 +142,40 @@ class model { std::vector get_weights(); /** @brief Get the list of callbacks for the model. 
*/ - virtual std::vector& get_callbacks() { - return m_callbacks; + virtual std::vector> get_callbacks() { + std::vector> callback_list; + callback_list.reserve(m_callbacks.size()); + for (const auto& ptr : m_callbacks) { + callback_list.push_back(ptr.get()); + } + return callback_list; } - /** @brief Return the I/O thread pool */ - std::shared_ptr get_io_thread_pool() { return m_io_thread_pool; } + virtual std::vector>& get_callbacks_with_ownership() { + return m_callbacks; + } /** @brief Get the model's comm. */ - inline lbann_comm *get_comm() const { + lbann_comm *get_comm() const { return m_comm; } - void set_execution_mode(execution_mode mode); - execution_mode get_execution_mode() const noexcept; - - /** @brief Number of times the training set has been traversed. */ - inline El::Int get_epoch() const noexcept { return m_epoch; } - - /** @brief Current mini-batch step for current execution mode. - * @details Step counts are not reset after each epoch. - */ - El::Int get_step() const noexcept; - - /** @brief Current mini-batch step for given execution mode. - * @details Step counts are not reset after each epoch. - */ - El::Int get_step(execution_mode mode) const noexcept; - - /** @brief Set the model's current mini-batch size. */ - inline void set_current_mini_batch_size(int mini_batch_size) { - m_current_mini_batch_size = mini_batch_size; - } - /** @brief Get the model's current mini-batch size. */ - inline int get_current_mini_batch_size() const { - return m_current_mini_batch_size; - } - /** @brief Get the model's maximum mini-batch size. */ - inline int get_max_mini_batch_size() const { - return m_max_mini_batch_size; + /** Check to see if there is a valid training context for the model */ + bool has_valid_execution_context() const { + return (m_execution_context != nullptr); } - /** @brief Get the model's effective mini-batch size. */ - inline int get_effective_mini_batch_size() const { - return m_effective_mini_batch_size; - } - /** @brief Set the model's effective mini-batch size. */ - inline void set_effective_mini_batch_size(int mini_batch_size) { - m_effective_mini_batch_size = mini_batch_size; - } - int get_num_iterations_per_epoch(execution_mode mode) const; - /** @brief Return true if the flag to stop training is set. */ - bool get_terminate_training() const { - return m_terminate_training; + /** Grab the training context of the model */ + const execution_context& get_execution_context() const { + if(m_execution_context == nullptr) { + LBANN_ERROR("execution context is not set"); + } + return *m_execution_context; } - /** @brief Set the terminate training flag (on or off). */ - void set_terminate_training(bool f) { - m_terminate_training = f; + + /** Grab the training context of the model */ + execution_context& get_execution_context() { + return const_cast(static_cast(*this).get_execution_context()); } // =========================================== @@ -187,10 +186,13 @@ class model { virtual void add_layer(std::unique_ptr l); /** @brief Add weights to model. */ - void add_weights(weights *w); + void add_weights(std::unique_ptr w); + + /** @brief Register a new callback for the model. */ + void add_callback(std::shared_ptr cb); /** @brief Register a new callback for the model. */ - void add_callback(lbann_callback *cb); + // void add_callbacks(std::vector>& cb); /** @brief Register a new metric for the model. */ void add_metric(metric *m); @@ -209,7 +211,14 @@ class model { * * If there is no default optimizer, a null pointer is returned. 
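Aside, for illustration only (not part of the patch): the replacement factory sketched just below keeps the default optimizer as a stored message prototype and stamps out a typed, owned optimizer on demand, returning null when no default was configured. The sketch here uses made-up names (opt_message, toy_optimizer, toy_model) and a plain struct in place of the protobuf message; LBANN's real implementation delegates to proto::construct_optimizer, as the patch shows.

#include <memory>
#include <string>

// Stand-in for the stored optimizer protobuf message.
struct opt_message { std::string type = "sgd"; double learn_rate = 0.01; };

template <typename T>
struct toy_optimizer { T learn_rate; };

class toy_model {
public:
  explicit toy_model(std::unique_ptr<opt_message> msg = nullptr)
    : m_default_optimizer_msg(std::move(msg)) {}

  // Same shape as the patched create_optimizer(): templated on the data
  // type, returns ownership, and yields nullptr if there is no default.
  template <typename T>
  std::unique_ptr<toy_optimizer<T>> create_optimizer() const {
    if (m_default_optimizer_msg) {
      auto opt = std::make_unique<toy_optimizer<T>>();
      opt->learn_rate = static_cast<T>(m_default_optimizer_msg->learn_rate);
      return opt;
    }
    return nullptr;
  }

private:
  std::unique_ptr<opt_message> m_default_optimizer_msg;
};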
*/ - optimizer* create_optimizer() const; + template + std::unique_ptr create_optimizer() const + { + if (m_default_optimizer_msg) + return proto::construct_optimizer( + *m_default_optimizer_msg); + return nullptr; + } /** @brief Set a flag that can be used to enable / disable the * background I/O activities @@ -219,27 +228,15 @@ class model { /** @brief Are background I/O activities enabled by the input layers */ bool background_io_activity_allowed() { return m_background_io_allowed; } + size_t get_num_iterations_per_epoch(execution_mode mode) const; + // =========================================== // Setup // =========================================== /** @details Must be called after model specification and before * execution. */ - virtual void setup(std::shared_ptr io_thread_pool); - - // =========================================== - // Execution - // =========================================== - - /** @brief Evaluate model. */ - virtual void evaluate(execution_mode mode, int num_batches=0); - - /** @brief Train model. */ - virtual void train(int num_epochs, int num_batches=0); - - /** @brief Complete any background I/O data fetch for the execution - mode requested */ - virtual void collect_background_data_fetch(execution_mode mode); + virtual void setup(size_t max_mini_batch_size, DataReaderMetaData& dr_metadata); virtual void make_data_store_preloaded(execution_mode mode); @@ -287,9 +284,6 @@ class model { protected: - /** @brief Check if the model execution mode is valid. */ - virtual bool is_execution_mode_valid(execution_mode mode) const; - /** @brief Reorder layer list with a gather. * * The new layer list is the same length as @c gather_indices and @@ -339,7 +333,7 @@ class model { * * Called in setup function. */ - virtual void setup_layers(); + virtual void setup_layers(size_t max_mini_batch_size, DataReaderMetaData& dr_metadata); /** @brief Set up weights. * * Called in setup function. All weights being used by layers or @@ -348,19 +342,31 @@ class model { */ virtual void setup_weights(); +public: + // =========================================== + // Execution + // =========================================== + /** @brief Reset model pointer and execution mode. */ - virtual void reset_mode_and_model(execution_mode mode); + virtual void reset_mode(execution_context& context, execution_mode mode); /** @brief Reset model statistics for an epoch. */ virtual void reset_epoch_statistics(execution_mode mode); - /** @brief Evaluate model on a mini-batch */ - virtual bool evaluate_mini_batch(execution_mode mode); - /** @brief Train model on a mini-batch. */ - virtual bool train_mini_batch(); + + /** @brief Check if the trainer execution mode is valid for this model. + @todo this should be moved to the trainer when the data readers move. */ + virtual bool is_execution_mode_valid(execution_mode mode) const; + + /** @brief Complete any background I/O data fetch for the execution + mode requested */ + virtual void collect_background_data_fetch(execution_mode mode); /** @brief Forward propagation step. */ virtual void forward_prop(execution_mode mode); /** @brief Backward propagation step. */ virtual void backward_prop(); + /** Evaluate any metrics in the model */ + virtual void evaluate_metrics(execution_mode mode, + size_t current_mini_batch_size); /** @brief Clear each optimizer's gradient. 
* * This must be called before training forward prop since layers @@ -382,22 +388,8 @@ class model { // Callbacks // =========================================== - /** @brief Execute callbacks at start of training. */ - virtual void do_train_begin_cbs(); - /** @brief Execute callbacks at end of training. */ - virtual void do_train_end_cbs(); - /** @brief Execute callbacks at start of evaluation. */ - virtual void do_evaluate_begin_cbs(execution_mode mode); - /** @brief Execute callbacks at end of evaluation. */ - virtual void do_evaluate_end_cbs(execution_mode mode); - /** @brief Execute callbacks at start of epoch. */ - virtual void do_epoch_begin_cbs(); - /** @brief Execute callbacks at end of epoch. */ - virtual void do_epoch_end_cbs(); - /** @brief Execute callbacks at start of mini-batch. */ - virtual void do_batch_begin_cbs(execution_mode mode); - /** @brief Execute callbacks at end of mini-batch. */ - virtual void do_batch_end_cbs(execution_mode mode); + /** @brief Execute callbacks at end of setup. */ + virtual void do_setup_end_cbs(); /** @brief Execute callbacks at start of model forward propagation. */ virtual void do_model_forward_prop_begin_cbs(execution_mode mode); /** @brief Execute callbacks at end of model forward propagation. */ @@ -425,6 +417,9 @@ class model { private: + /** Pointer to the execution context object used for training or evaluating this model */ + observer_ptr m_execution_context; + /** @brief LBANN communicator. */ lbann_comm* m_comm; @@ -434,54 +429,23 @@ class model { */ std::string m_name; - /** @brief Current execution mode. */ - execution_mode m_execution_mode = execution_mode::training; - - /** @brief Number of times the training data set has been traversed. */ - El::Int m_epoch = 0; - - /** @brief Number of mini-batch steps performed. - * @details Step counts are not reset after each epoch. - */ - std::map m_step; - - /** @brief Whether to terminate training. - * @details If true, training will terminate immediately before - * the next epoch. - */ - bool m_terminate_training = false; - - /** @brief Size of the current mini-batch in the model. */ - int m_current_mini_batch_size; - /** @details Maximum possible minibatch size supported by layers in - * this model. Note that this is local to the particular model, - * not across multiple models. - */ - int m_max_mini_batch_size; - /** @brief The "effective" size of a minibatch. - * - * This is the size of the minibatch across all models and used for - * e.g. correctly averaging gradients from multiple models. - */ - int m_effective_mini_batch_size; - /** @brief Tensor operations. * @details The list is in execution order for forward propagation. */ std::vector> m_layers; /** @brief Trainable parameters. */ - std::vector m_weights; + std::vector> m_weights; /** @details If a layer needs to construct an optimizer during * setup, it will make a copy of the default optimizer. This object * is just used to create copies and is not actually used for * optimization. */ - optimizer* m_default_optimizer = nullptr; + std::unique_ptr m_default_optimizer_msg; /** @brief Mathematical function to be minimized during training. */ - objective_function* m_objective_function; + std::unique_ptr m_objective_function; /** @brief Numerical quantities to evaluate model performance. * @details Does not affect training. @@ -489,14 +453,16 @@ class model { std::vector m_metrics; /** @brief Current callbacks to process. 
*/ - std::vector m_callbacks; - - /** @brief Threads available for I/O */ - std::shared_ptr m_io_thread_pool; + std::vector> m_callbacks; /** @brief Flag that allows input layers to fetch data in the background */ bool m_background_io_allowed = true; + /** @brief Is the model setup + * @details Flag to indicate if the setup function has been called + */ + bool m_model_is_setup = false; + // =========================================== // Functions to add utility layers // =========================================== @@ -534,6 +500,11 @@ class model { */ void add_split_layers(std::unordered_set& layer_names); +#ifdef LBANN_HAS_DISTCONV + void setup_distconv(); + void setup_distributions(); + void print_distributions() const; +#endif // LBANN_HAS_DISTCONV }; } // namespace lbann diff --git a/include/lbann/objective_functions/layer_term.hpp b/include/lbann/objective_functions/layer_term.hpp index 7a3622537fe..7d8aa1508a5 100644 --- a/include/lbann/objective_functions/layer_term.hpp +++ b/include/lbann/objective_functions/layer_term.hpp @@ -58,7 +58,7 @@ class layer_term : public objective_function_term { private: /** Get corresponding evaluation layer. */ - abstract_evaluation_layer& get_evaluation_layer(); + /*abstract_evaluation_*/Layer& get_evaluation_layer(); }; diff --git a/include/lbann/objective_functions/objective_function.hpp b/include/lbann/objective_functions/objective_function.hpp index 9e0195bdb80..ad55f33cd13 100644 --- a/include/lbann/objective_functions/objective_function.hpp +++ b/include/lbann/objective_functions/objective_function.hpp @@ -48,6 +48,17 @@ class objective_function { /** Copy function. */ objective_function* copy() const { return new objective_function(*this); } + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_statistics)); + + // Serialized each objective function term object explicitly, not the pointer to + // the objective function term + for(auto&& t : m_terms) { + ar(CEREAL_NVP(*t)); + } + } + /** Add a term to the objective function. * The objective function takes ownership of the objective function * term and deallocates it during destruction. @@ -84,9 +95,13 @@ class objective_function { void compute_weight_regularization(); /** Clear all statistics. */ - void reset_statistics() { m_statistics.clear(); } + void reset_statistics() { + for (auto& stats : m_statistics) { + stats.second.reset(); + } + } /** Clear statistics for an execution mode. */ - void reset_statistics(execution_mode mode) { m_statistics.erase(mode); } + void reset_statistics(execution_mode mode) { m_statistics[mode].reset(); } /** Get mean objective function value. * This is a weighted average such that each mini-batch sample makes diff --git a/include/lbann/objective_functions/objective_function_term.hpp b/include/lbann/objective_functions/objective_function_term.hpp index 1fa13bff220..e5f4546729b 100644 --- a/include/lbann/objective_functions/objective_function_term.hpp +++ b/include/lbann/objective_functions/objective_function_term.hpp @@ -49,6 +49,11 @@ class objective_function_term { /** Copy function. */ virtual objective_function_term* copy() const = 0; + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_scale_factor)); + } + /** Get the name of the objective function term. 
*/ virtual std::string name() const = 0; diff --git a/include/lbann/objective_functions/weight_regularization/l2.hpp b/include/lbann/objective_functions/weight_regularization/l2.hpp index d8ef6fa47c0..75c4eece8f8 100644 --- a/include/lbann/objective_functions/weight_regularization/l2.hpp +++ b/include/lbann/objective_functions/weight_regularization/l2.hpp @@ -31,6 +31,9 @@ namespace lbann { +template class data_type_optimizer; +template class data_type_weights; + /** @class l2_weight_regularization * @brief Apply L2 regularization to a set of weights. * @@ -40,12 +43,29 @@ namespace lbann { */ class l2_weight_regularization : public objective_function_term { public: + using AccumulateDataType = DataType; + + using OptimizerType = data_type_optimizer; + + using WeightsType = data_type_weights; + + template + using DMatType = El::Matrix; + using CPUMatType = DMatType; + +public: /** @param scale_factor The objective function term is * @f$ \text{scale\_factor} \times \sum L2(w_i) @f$ */ l2_weight_regularization(EvalType scale_factor = 1); l2_weight_regularization* copy() const override { return new l2_weight_regularization(*this); } + + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(cereal::base_class(this)); + } + std::string name() const override { return "L2 weight regularization"; } void setup(model& m) override; void start_evaluation() override; @@ -69,7 +89,7 @@ class l2_weight_regularization : public objective_function_term { private: /** Contributions to evaluated value. */ - std::map m_contributions; + std::map m_contributions; /** For non-blocking allreduces. */ Al::request m_allreduce_req; @@ -85,8 +105,8 @@ class l2_weight_regularization : public objective_function_term { * accumulation variable. */ template - static void accumulate_contribution(const DMat& vals, - DMat& contribution); + static void accumulate_contribution(const DMatType& vals, + DMatType& contribution); }; diff --git a/include/lbann/optimizers/CMakeLists.txt b/include/lbann/optimizers/CMakeLists.txt index 877cc8bb815..3147d074337 100644 --- a/include/lbann/optimizers/CMakeLists.txt +++ b/include/lbann/optimizers/CMakeLists.txt @@ -2,6 +2,7 @@ set_full_path(THIS_DIR_HEADERS adagrad.hpp adam.hpp + data_type_optimizer.hpp hypergradient_adam.hpp optimizer.hpp rmsprop.hpp diff --git a/include/lbann/optimizers/adagrad.hpp b/include/lbann/optimizers/adagrad.hpp index 9a5cc8adbe6..c0255ed7eac 100644 --- a/include/lbann/optimizers/adagrad.hpp +++ b/include/lbann/optimizers/adagrad.hpp @@ -27,7 +27,9 @@ #ifndef LBANN_OPTIMIZERS_ADAGRAD_HPP_INCLUDED #define LBANN_OPTIMIZERS_ADAGRAD_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -39,39 +41,63 @@ namespace lbann { * methods for online learning and stochastic optimization." Journal * of Machine Learning Research 12, no. Jul (2011): 2121-2159. */ -class adagrad : public optimizer { +template +class adagrad : public Cloneable, + data_type_optimizer> { + using BaseType = Cloneable, + data_type_optimizer>; public: + /** @name Public Types */ + ///@{ - adagrad(lbann_comm* comm, DataType learning_rate, DataType eps = 1e-8); + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. 
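Several classes in this patch gain a serialize() member for checkpoint and restart; the pattern is plain Cereal, with derived classes archiving their base through cereal::base_class before their own members. A standalone sketch with toy types (not the LBANN classes):

    #include <cereal/archives/binary.hpp>
    #include <cereal/types/base_class.hpp>
    #include <iostream>
    #include <sstream>

    struct term {
      double scale_factor = 1.0;
      template <class Archive> void serialize(Archive& ar) {
        ar(CEREAL_NVP(scale_factor));
      }
    };

    struct l2_like_term : term {
      double eps = 1e-8;
      template <class Archive> void serialize(Archive& ar) {
        // Archive the base-class state first, then this class's members.
        ar(cereal::base_class<term>(this), CEREAL_NVP(eps));
      }
    };

    int main() {
      std::stringstream stream;
      l2_like_term saved; saved.scale_factor = 0.5; saved.eps = 1e-6;
      { cereal::BinaryOutputArchive ar(stream); ar(saved); }
      l2_like_term restored;
      { cereal::BinaryInputArchive ar(stream); ar(restored); }
      std::cout << restored.scale_factor << " " << restored.eps << "\n";  // 0.5 1e-06
    }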
*/ + using WeightsType = data_type_weights; + + ///@} + +public: + + adagrad(TensorDataType learning_rate, TensorDataType eps = 1e-8); adagrad(const adagrad& other); adagrad& operator=(const adagrad& other); ~adagrad() override = default; - adagrad* copy() const override { return new adagrad(*this); } + + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(cereal::base_class>(this), + CEREAL_NVP(m_eps)); + } /** Human-readable type name. */ std::string get_type() const override { return "AdaGrad"; } /** Human-readable description. */ description get_description() const override; - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; protected: /** Computation for an optimization step. */ - void step_compute(AbsDistMat& values, const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, const AbsDistMatrixType& gradient) override; private: /** Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** AdaGrad cache. */ - std::unique_ptr m_cache; + std::unique_ptr m_cache; /** CPU implementation of optimization step. */ - void step_compute_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #ifdef LBANN_HAS_CUDNN /** GPU implementation of optimization step. */ - void step_compute_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #endif // LBANN_HAS_CUDNN // =========================================== @@ -85,6 +111,11 @@ class adagrad : public optimizer { }; +template +std::unique_ptr +build_adagrad_optimizer_from_pbuf( + google::protobuf::Message const&); + } // namespace lbann #endif // LBANN_OPTIMIZERS_ADAGRAD_HPP_INCLUDED diff --git a/include/lbann/optimizers/adam.hpp b/include/lbann/optimizers/adam.hpp index 696c8416599..e64c67929f1 100644 --- a/include/lbann/optimizers/adam.hpp +++ b/include/lbann/optimizers/adam.hpp @@ -27,9 +27,16 @@ #ifndef LBANN_OPTIMIZERS_ADAM_HPP_INCLUDED #define LBANN_OPTIMIZERS_ADAM_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include +#include +//#include namespace lbann { +namespace callback { +class perturb_adam; +} // namespace callback /** @brief Adam optimizer. * @@ -38,22 +45,48 @@ namespace lbann { * Diederik P. Kingma and Jimmy Ba. "Adam: A method for stochastic * optimization." arXiv preprint arXiv:1412.6980 (2014). */ -class adam : public optimizer { +template +class adam : public Cloneable, + data_type_optimizer> { + using BaseType = Cloneable, + data_type_optimizer>; +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. 
*/ + using WeightsType = data_type_weights; + + ///@} + public: /** @name Life cycle functions */ ///@{ - adam(lbann_comm* comm, - DataType learning_rate, - DataType beta1 = 0.9, - DataType beta2 = 0.99, - DataType eps = 1e-8); + adam(TensorDataType learning_rate, + TensorDataType beta1 = 0.9, + TensorDataType beta2 = 0.99, + TensorDataType eps = 1e-8); adam(const adam& other); adam& operator=(const adam& other); ~adam() = default; - adam* copy() const override { return new adam(*this); } + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(cereal::base_class>(this), + CEREAL_NVP(m_beta1), + CEREAL_NVP(m_beta2), + CEREAL_NVP(m_eps), + CEREAL_NVP(m_current_beta1), + CEREAL_NVP(m_current_beta2)); + } ///@} /** @name Descriptions */ @@ -70,132 +103,91 @@ class adam : public optimizer { ///@{ /** Update factor for first moment estimate. */ - DataType get_beta1() const noexcept { return m_beta1; } + TensorDataType get_beta1() const noexcept { return m_beta1; } /** Update factor for first moment estimate. */ - void set_beta1(DataType beta1) { m_beta1 = beta1; } + void set_beta1(TensorDataType beta1) { m_beta1 = beta1; } /** Update factor for second moment estimate. */ - DataType get_beta2() const noexcept { return m_beta2; } + TensorDataType get_beta2() const noexcept { return m_beta2; } /** Update factor for second moment estimate. */ - void set_beta2(DataType beta2) { m_beta2 = beta2; } + void set_beta2(TensorDataType beta2) { m_beta2 = beta2; } /** Small factor to avoid division by zero. */ - DataType get_eps() const noexcept { return m_eps; } + TensorDataType get_eps() const noexcept { return m_eps; } /** Small factor to avoid division by zero. */ - void set_eps(DataType eps) { m_eps = eps; } + void set_eps(TensorDataType eps) { m_eps = eps; } /** First moment estimates. */ - const AbsDistMat& get_moment1() const; + const AbsDistMatrixType& get_moment1() const; /** First moment estimates. */ - AbsDistMat& get_moment1(); + AbsDistMatrixType& get_moment1(); /** Second moment estimates. */ - const AbsDistMat& get_moment2() const; + const AbsDistMatrixType& get_moment2() const; /** Second moment estimates. */ - AbsDistMat& get_moment2(); + AbsDistMatrixType& get_moment2(); /** beta1 ^ iteration. * @todo This probably shouldn't be exposed. */ - DataType get_current_beta1() const noexcept { return m_current_beta1; } + TensorDataType get_current_beta1() const noexcept { return m_current_beta1; } /** beta1 ^ iteration. * @todo This probably shouldn't be exposed. */ - void set_current_beta1(DataType current_beta1) { m_current_beta1 = current_beta1; } + void set_current_beta1(TensorDataType current_beta1) { m_current_beta1 = current_beta1; } /** beta2 ^ iteration. * @todo This probably shouldn't be exposed. */ - DataType get_current_beta2() const noexcept { return m_current_beta2; } + TensorDataType get_current_beta2() const noexcept { return m_current_beta2; } /** beta2 ^ iteration. * @todo This probably shouldn't be exposed. */ - void set_current_beta2(DataType current_beta2) { m_current_beta2 = current_beta2; } + void set_current_beta2(TensorDataType current_beta2) { m_current_beta2 = current_beta2; } ///@} /** @name Setup */ ///@{ - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; ///@} protected: /** Computation for an optimization step. 
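For reference, the hyperparameters above drive the usual Adam update. A hedged scalar sketch of one step, using the single-factor bias-correction form from the Adam paper rather than LBANN's actual kernel:

    #include <cmath>
    #include <cstdio>

    struct adam_state { double m = 0, v = 0, beta1_t = 1, beta2_t = 1; };

    double adam_step(double w, double g, adam_state& s, double lr,
                     double beta1 = 0.9, double beta2 = 0.99, double eps = 1e-8) {
      s.m = beta1 * s.m + (1 - beta1) * g;      // first moment estimate
      s.v = beta2 * s.v + (1 - beta2) * g * g;  // second moment estimate
      s.beta1_t *= beta1;                        // beta1 ^ iteration
      s.beta2_t *= beta2;                        // beta2 ^ iteration
      // Bias correction folded into a single step-size factor.
      const double correction = std::sqrt(1 - s.beta2_t) / (1 - s.beta1_t);
      return w - lr * correction * s.m / (std::sqrt(s.v) + eps);
    }

    int main() {
      adam_state s;
      double w = 1.0;
      for (int i = 0; i < 3; ++i) { w = adam_step(w, /*gradient=*/2.0 * w, s, 0.1); }
      std::printf("%f\n", w);
    }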
*/ - void step_compute(AbsDistMat& values, - const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, + const AbsDistMatrixType& gradient) override; private: /** Update factor for first moment estimate. */ - DataType m_beta1; + TensorDataType m_beta1; /** Update factor for second moment estimate. */ - DataType m_beta2; + TensorDataType m_beta2; /** Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** beta1 ^ iteration. */ - DataType m_current_beta1 = 1; + TensorDataType m_current_beta1 = TensorDataType(1.); /** beta2 ^ iteration. */ - DataType m_current_beta2 = 1; + TensorDataType m_current_beta2 = TensorDataType(1.); /** First moment estimates. */ - std::unique_ptr m_moment1; + std::unique_ptr m_moment1; /** Second moment estimates. */ - std::unique_ptr m_moment2; + std::unique_ptr m_moment2; /** Hyperparameter exploration. */ - friend class lbann_callback_perturb_adam; + friend class callback::perturb_adam; /** CPU implementation of optimization step. */ - void step_compute_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient, + const TensorDataType& correction); #ifdef LBANN_HAS_CUDA /** GPU implementation of optimization step. */ - void step_compute_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient, + const TensorDataType& correction); #endif // LBANN_HAS_CUDA /** @name Checkpointing */ ///@{ - /* struct used to serialize mode fields in file and MPI transfer */ - struct packing_header { - DataType beta1; - DataType beta2; - DataType eps; - DataType current_beta1; - DataType current_beta2; - }; - - bool pack_scalars(persist& p) { - p.write_datatype(persist_type::train, "beta1", m_beta1); - p.write_datatype(persist_type::train, "beta2", m_beta2); - p.write_datatype(persist_type::train, "eps", m_eps); - p.write_datatype(persist_type::train, "current_beta1", m_current_beta1); - p.write_datatype(persist_type::train, "current_beta2", m_current_beta2); - return true; - } - - bool unpack_scalars(persist& p, struct packing_header *header) { - p.read_datatype(persist_type::train, "beta1", &m_beta1); - p.read_datatype(persist_type::train, "beta2", &m_beta2); - p.read_datatype(persist_type::train, "eps", &m_eps); - p.read_datatype(persist_type::train, "current_beta1", &m_current_beta1); - p.read_datatype(persist_type::train, "current_beta2", &m_current_beta2); - - if(header != nullptr) { - header->beta1 = m_beta1; - header->beta2 = m_beta2; - header->eps = m_eps; - header->current_beta1 = m_current_beta1; - header->current_beta2 = m_current_beta2; - } - return true; - } - - void unpack_header(struct packing_header& header) { - m_beta1 = header.beta1; - m_beta2 = header.beta2; - m_eps = header.eps; - m_current_beta1 = header.current_beta1; - m_current_beta2 = header.current_beta2; - } - bool save_to_checkpoint_shared(persist& p, std::string m_name) override; bool load_from_checkpoint_shared(persist& p, std::string m_name) override; bool save_to_checkpoint_distributed(persist& p, std::string m_name) override; @@ -205,6 +197,11 @@ class adam : public optimizer { }; +template +std::unique_ptr +build_adam_optimizer_from_pbuf( + google::protobuf::Message const&); + } // namespace lbann #endif // LBANN_OPTIMIZERS_ADAM_HPP_INCLUDED diff --git a/include/lbann/optimizers/data_type_optimizer.hpp b/include/lbann/optimizers/data_type_optimizer.hpp new file mode 100644 index 
00000000000..9aadbbffde3 --- /dev/null +++ b/include/lbann/optimizers/data_type_optimizer.hpp @@ -0,0 +1,180 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_OPTIMIZERS_DATA_TYPE_OPTIMIZER_HPP_INCLUDED +#define LBANN_OPTIMIZERS_DATA_TYPE_OPTIMIZER_HPP_INCLUDED + +#include "lbann/optimizers/optimizer.hpp" + +namespace lbann { + +// Forward declarations +template +class data_type_weights; + +template +class data_type_optimizer + : public Cloneable< + HasAbstractFunction>, + optimizer> { + + using BaseType = + Cloneable>, + optimizer>; + + friend class data_type_weights; + +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} + +public: + data_type_optimizer(TensorDataType learning_rate = 0); + virtual ~data_type_optimizer() = default; + + /** @brief Human-readable description. */ + virtual description get_description() const override; + + /** @brief Must be called before training. + * + * @param w Weights being optimized. If null, no change is made to + * the weights. + */ + virtual void setup(data_type_weights* w = nullptr); + + /** @name Weights management */ + ///@{ + + /** @brief Weights being optimized. */ + data_type_weights& get_weights(); + /** @brief Weights being optimized. */ + const data_type_weights& get_weights() const; + /** @brief Weights being optimized. */ + void set_weights(data_type_weights* w) { m_weights = w; } + + ///@} + /** @name Gradient update management */ + ///@{ + + /** @brief Objective function gradient w.r.t. the weights. + * + * An allreduce may be launched and/or synchronized if needed. + */ + AbsDistMatrixType& get_gradient(); + + /** @brief Optimization step. */ + void step() override; + ///@} + + /** @brief Access the scaling factor for optimization step sizes. */ + TensorDataType get_learning_rate() const; + /** @brief Set the scaling factor for optimization step sizes. 
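data_type_optimizer and the concrete optimizers in this patch replace hand-written copy() overrides with the Cloneable CRTP mixin from lbann/utils/cloneable.hpp. A generic sketch of the covariant-clone idiom under assumed names (the real mixin also supports abstract bases via HasAbstractFunction):

    #include <iostream>
    #include <memory>

    struct shape {
      virtual ~shape() = default;
      virtual std::unique_ptr<shape> clone() const = 0;
      virtual double area() const = 0;
    };

    // CRTP mixin: any concrete class deriving from cloneable<Derived, Base>
    // gets a correct clone() without writing it by hand.
    template <typename Derived, typename Base>
    struct cloneable : Base {
      std::unique_ptr<Base> clone() const override {
        return std::make_unique<Derived>(static_cast<const Derived&>(*this));
      }
    };

    struct square : cloneable<square, shape> {
      double side = 1.0;
      double area() const override { return side * side; }
    };

    int main() {
      square s; s.side = 3.0;
      std::unique_ptr<shape> copy = s.clone();
      std::cout << copy->area() << "\n";  // prints 9
    }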
*/ + void set_learning_rate(TensorDataType learning_rate); + + /** @name Checkpointing functionality */ + ///@{ + /** @brief Archive for checkpoint and restart */ + template + void serialize(Archive & ar) { + ar(cereal::base_class(this), CEREAL_NVP(m_learning_rate)); + } + ///@} + +protected: + + data_type_optimizer(const data_type_optimizer& other); + data_type_optimizer& operator=(const data_type_optimizer& other); + + /** @brief Computation for an optimization step. + * + * @c values and @c gradient can be assumed to have the same + * distribution. + */ + virtual void step_compute(AbsDistMatrixType& values, + const AbsDistMatrixType& gradient) = 0; + + /** @brief Get the info needed to construct a new gradient matrix. + * @return Tuple of height, width, and DistData. + */ + std::tuple get_matrix_info() const final; + +private: + + /** @brief Weights being optimized. */ + data_type_weights* m_weights = nullptr; + + /** @brief Objective function gradient w.r.t. weights. */ + std::unique_ptr m_gradient; + + /** @brief Workspace matrix. + * + * Helps ensure gradient contributions are in the right + * distribution. Most of the time, this should just be a matrix + * view. + */ + std::unique_ptr m_gradient_v; + + /** @brief Communication request object for gradient allreduce. + * + * Used to synchronize non-blocking allreduce. + */ + Al::request m_gradient_allreduce_req; + + /** @brief Scaling factor for optimization step sizes. + * + * This is not used by the base optimizer class, but is currently + * used by all derived optimizer classes. There are several cases + * where it is convenient to expose this in the base class, + * e.g. for variable learning rate schedules. + * + * @todo Consider moving this to the derived classes. + */ + TensorDataType m_learning_rate; +}; + +#ifndef LBANN_DATA_TYPE_OPTIMIZER_INSTANTIATE +#define PROTO(T) \ + extern template class data_type_optimizer + +#define LBANN_INSTANTIATE_CPU_HALF +#define LBANN_INSTANTIATE_GPU_HALF +#include "lbann/macros/instantiate.hpp" +#undef PROTO +#undef LBANN_INSTANTIATE_CPU_HALF +#undef LBANN_INSTANTIATE_GPU_HALF +#endif // LBANN_DATA_TYPE_OPTIMIZER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_OPTIMIZERS_DATA_TYPE_OPTIMIZER_HPP_INCLUDED diff --git a/include/lbann/optimizers/hypergradient_adam.hpp b/include/lbann/optimizers/hypergradient_adam.hpp index b0d362ad02e..57e9a7b7845 100644 --- a/include/lbann/optimizers/hypergradient_adam.hpp +++ b/include/lbann/optimizers/hypergradient_adam.hpp @@ -27,7 +27,9 @@ #ifndef LBANN_OPTIMIZERS_HYPERGRADIENT_ADAM_HPP_INCLUDED #define LBANN_OPTIMIZERS_HYPERGRADIENT_ADAM_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -39,7 +41,27 @@ namespace lbann { * Baydin et al. "Online Learning Rate Adaptation with Hypergradient * Descent", 2017. */ -class hypergradient_adam : public optimizer { +template +class hypergradient_adam + : public Cloneable, + data_type_optimizer> { + using BaseType = Cloneable, + data_type_optimizer>; +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The base optimizer type for this class. 
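The #ifndef LBANN_DATA_TYPE_OPTIMIZER_INSTANTIATE block that closes data_type_optimizer.hpp above is the usual explicit-instantiation guard: includers see only extern template declarations, and a single source file defines the macro and instantiates the class for each supported type. A generic sketch of that arrangement with a toy template (names assumed, not LBANN's macros):

    // my_template.hpp
    template <typename T>
    struct accumulator {
      T sum = T(0);
      void add(T x) { sum += x; }
    };

    #ifndef MY_TEMPLATE_INSTANTIATE
    // Every includer promises not to instantiate these; the definitions
    // live in exactly one translation unit (see below).
    extern template struct accumulator<float>;
    extern template struct accumulator<double>;
    #endif

    // my_template.cpp (the one translation unit that instantiates)
    // #define MY_TEMPLATE_INSTANTIATE
    // #include "my_template.hpp"
    // template struct accumulator<float>;
    // template struct accumulator<double>;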
*/ + using OptimizerType = data_type_optimizer; + + ///@} + public: /** @brief Construct a Hypergradient Adam optimizer object @@ -55,105 +77,63 @@ class hypergradient_adam : public optimizer { * @param eps Small factor to avoid division by * zero. */ - hypergradient_adam(lbann_comm *comm, - DataType init_learning_rate = 1e-3, - DataType hyper_learning_rate = 1e-7, - DataType beta1 = 0.9, - DataType beta2 = 0.99, - DataType eps = 1e-8); + hypergradient_adam(TensorDataType init_learning_rate = 1e-3, + TensorDataType hyper_learning_rate = 1e-7, + TensorDataType beta1 = 0.9, + TensorDataType beta2 = 0.99, + TensorDataType eps = 1e-8); hypergradient_adam(const hypergradient_adam& other); hypergradient_adam& operator=(const hypergradient_adam& other); ~hypergradient_adam() override = default; - hypergradient_adam* copy() const override { return new hypergradient_adam(*this); } + + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(cereal::base_class>(this), + CEREAL_NVP(m_hyper_learning_rate), + CEREAL_NVP(m_beta1), + CEREAL_NVP(m_beta2), + CEREAL_NVP(m_eps), + CEREAL_NVP(m_current_beta1), + CEREAL_NVP(m_current_beta2)); + } /** @brief Human-readable type name. */ std::string get_type() const override { return "hypergradient Adam"; } /** @brief Human-readable description. */ description get_description() const override; - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; protected: /** @brief Computation for an optimization step. */ - void step_compute(AbsDistMat& values, const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, const AbsDistMatrixType& gradient) override; private: /** @brief Hypergradient learning rate. */ - DataType m_hyper_learning_rate; + TensorDataType m_hyper_learning_rate; /** @brief Update factor for first moment estimate. */ - DataType m_beta1; + TensorDataType m_beta1; /** @brief Update factor for second moment estimate. */ - DataType m_beta2; + TensorDataType m_beta2; /** @brief Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** @brief beta1 ^ iteration. */ - DataType m_current_beta1; + TensorDataType m_current_beta1; /** @brief beta2 ^ iteration. */ - DataType m_current_beta2; + TensorDataType m_current_beta2; /** @brief First moment estimates. */ - std::unique_ptr m_moment1; + std::unique_ptr m_moment1; /** @brief Second moment estimates. */ - std::unique_ptr m_moment2; + std::unique_ptr m_moment2; /** @brief Gradient estimate from the prior step (for hypergradient). 
*/ - std::unique_ptr m_old_gradient; + std::unique_ptr m_old_gradient; // =========================================== // Checkpointing // =========================================== - /** @struct packing_header - * @brief Used to serialize mode fields in file and MPI transfer - */ - struct packing_header { - DataType hyper_learning_rate; - DataType beta1; - DataType beta2; - DataType eps; - DataType current_beta1; - DataType current_beta2; - }; - - bool pack_scalars(persist& p) { - p.write_datatype(persist_type::train, "hyper_learning_rate", m_hyper_learning_rate); - p.write_datatype(persist_type::train, "beta1", m_beta1); - p.write_datatype(persist_type::train, "beta2", m_beta2); - p.write_datatype(persist_type::train, "eps", m_eps); - p.write_datatype(persist_type::train, "current_beta1", m_current_beta1); - p.write_datatype(persist_type::train, "current_beta2", m_current_beta2); - return true; - } - - bool unpack_scalars(persist& p, struct packing_header *header) { - p.read_datatype(persist_type::train, "hyper_learning_rate", &m_hyper_learning_rate); - p.read_datatype(persist_type::train, "beta1", &m_beta1); - p.read_datatype(persist_type::train, "beta2", &m_beta2); - p.read_datatype(persist_type::train, "eps", &m_eps); - p.read_datatype(persist_type::train, "current_beta1", &m_current_beta1); - p.read_datatype(persist_type::train, "current_beta2", &m_current_beta2); - - if(header != nullptr) { - header->hyper_learning_rate = m_hyper_learning_rate; - header->beta1 = m_beta1; - header->beta2 = m_beta2; - header->eps = m_eps; - header->current_beta1 = m_current_beta1; - header->current_beta2 = m_current_beta2; - } - - return true; - } - - void unpack_header(struct packing_header& header) { - m_hyper_learning_rate = header.hyper_learning_rate; - m_beta1 = header.beta1; - m_beta2 = header.beta2; - m_eps = header.eps; - m_current_beta1 = header.current_beta1; - m_current_beta2 = header.current_beta2; - } - bool save_to_checkpoint_shared(persist& p, std::string m_name) override; bool load_from_checkpoint_shared(persist& p, std::string m_name) override; bool save_to_checkpoint_distributed(persist& p, std::string m_name) override; @@ -161,6 +141,11 @@ class hypergradient_adam : public optimizer { }; +template +std::unique_ptr +build_hypergradient_adam_optimizer_from_pbuf( + google::protobuf::Message const&); + } // namespace lbann #endif // LBANN_OPTIMIZER_HYPERGRADIENT_ADAM_HPP_INCLUDED diff --git a/include/lbann/optimizers/optimizer.hpp b/include/lbann/optimizers/optimizer.hpp index 6e0e9ee6712..d433423b0b4 100644 --- a/include/lbann/optimizers/optimizer.hpp +++ b/include/lbann/optimizers/optimizer.hpp @@ -27,18 +27,23 @@ #ifndef LBANN_OPTIMIZERS_OPTIMIZER_HPP_INCLUDED #define LBANN_OPTIMIZERS_OPTIMIZER_HPP_INCLUDED -#include -#include -#include -#include "lbann/utils/compiler_control.hpp" #include "lbann/base.hpp" #include "lbann/comm.hpp" -#include "lbann/utils/exception.hpp" -#include "lbann/utils/description.hpp" -#include "lbann/weights/weights.hpp" +#include "lbann/utils/cloneable.hpp" +#include "lbann/utils/compiler_control.hpp" #ifdef LBANN_HAS_GPU #include "lbann/utils/cuda.hpp" #endif // LBANN_HAS_GPU +#include "lbann/utils/description.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" +#include "lbann/weights/weights.hpp" + +#include + +#include +#include +#include namespace lbann { @@ -56,14 +61,13 @@ enum class optimizer_gradient_status { * @details Non-blocking allreduce must be synchronized before * accessing. 
*/ - allreduce_started + allreduce_started, }; /** @brief Human-readable string for status of gradient in optimizer. */ std::string to_string(optimizer_gradient_status status); // Forward declarations -class weights; class persist; /** @brief Abstract base class for gradient-based optimization algorithms. @@ -74,37 +78,24 @@ class persist; * optimization step requires the objective function gradient * w.r.t. the weights. */ -class optimizer { +class optimizer : public Cloneable> { public: - optimizer(lbann_comm* comm, DataType learning_rate = 0); - optimizer(const optimizer& other); - optimizer& operator=(const optimizer& other); + /** @name Constructors and Destructor */ + ///@{ + + optimizer(); virtual ~optimizer() = default; - /** @brief Create a copy of the class instance. - * - * The caller is responsible for deallocating the returned object. - */ - virtual optimizer* copy() const = 0; + ///@} /** @brief Human-readable type name. */ virtual std::string get_type() const = 0; /** @brief Human-readable description. */ virtual description get_description() const; - /** @brief Weights being optimized. */ - weights& get_weights(); - /** @brief Weights being optimized. */ - const weights& get_weights() const; - /** @brief Weights being optimized. */ - void set_weights(weights* w) { m_weights = w; } - - /** @brief Objective function gradient w.r.t. the weights. - * - * An allreduce may be launched and/or synchronized if needed. - */ - AbsDistMat& get_gradient(); + /** @name Gradient update management */ + ///@{ /** @brief Add to the objective function gradient w.r.t. the weights. * @param gradient Contribution to gradient. @@ -118,36 +109,30 @@ class optimizer { * allreduce is performed lazily when the * gradient is accessed. */ - void add_to_gradient(const AbsDistMat& gradient, - DataType scale = DataType(1), - bool allreduce_needed = false); + template + void add_to_gradient(El::AbstractDistMatrix const& contrib, + TensorDataType scale = 1.f, + bool allreduce_needed = false) { + TensorDataType buf_scale, in_scale; + auto& grad = get_gradient_buffer(buf_scale, in_scale, allreduce_needed); + El::Scale(buf_scale, grad); + El::Axpy(in_scale*scale, contrib, grad); + } + /** @brief Zero out the objective function gradient w.r.t. the weights. */ - void clear_gradient(); - /** @brief Get the gradient buffer. - * - * This provides access to the underlying gradient buffer, which may be - * directly summed into. This buffer should be considered ephemeral and not - * stored. The caller must also ensure the buffer has an appropriate - * distribution. buf_scale provides the caller with a scale factor that must - * be applied to the gradient buffer before writing to it, and in_scale - * provides a scaling factor that must be applied to the user's data. - * Essentially, this enables computations of the form - * gradient = buf_scale*gradient + in_scale*new_gradient - * This is an expert-mode function and is intended to help eliminate copies - * and facilitate kernel fusion. - * - * @param buf_scale A scale factor provided to the caller to scale the - * returned buffer by. - * @param in_scale A scale factor provided to the caller to scale their - * gradient contributions by. - * @param allreduce_needed Whether this gradient contribution will need to - * be allreduced. 
- */ - AbsDistMat& get_gradient_buffer(DataType& buf_scale, - DataType& in_scale, - bool allreduce_needed = false); + void clear_gradient() { + for (auto& g : gradients_) { + if (g.second->get_status() == + optimizer_gradient_status::allreduce_started) { + g.second->complete_allreduce(*m_comm); + } + g.second->clear(); + } + this->get_gradient_sources().clear(); + } /** @brief Objects that are expected to contribute to the gradient. */ + El::Int get_num_gradient_sources() const; /** @brief Register a gradient source. * @@ -157,6 +142,7 @@ class optimizer { * forward prop. */ void add_gradient_source(const void* source); + /** @brief Unregister a gradient source. * * When an object adds its contribution to the objective function @@ -166,59 +152,196 @@ class optimizer { */ void remove_gradient_source(const void* source); - /** @brief Must be called before training. + /** @brief Perform optimization step. */ + virtual void step() = 0; + + /** @brief Get the gradient buffer. + * + * This provides access to the underlying gradient buffer, which + * may be directly summed into. This buffer should be considered + * ephemeral and not stored. The caller must also ensure the buffer + * has an appropriate distribution. buf_scale provides the caller + * with a scale factor that must be applied to the gradient buffer + * before writing to it, and in_scale provides a scaling factor + * that must be applied to the user's data. Essentially, this + * enables computations of the form + * @verbatim + * gradient = buf_scale*gradient + in_scale*new_gradient + * @endverbatim + * This is an expert-mode function and is intended to help + * eliminate copies and facilitate kernel fusion. * - * @param w Weights being optimized. If null, no change is made to - * the weights. + * @param buf_scale A scale factor provided to the caller to scale + * the returned buffer by. + * @param in_scale A scale factor provided to the caller to scale + * their gradient contributions by. + * @param allreduce_needed Whether this gradient contribution will need to + * be allreduced. */ - virtual void setup(weights* w = nullptr); + template + El::AbstractDistMatrix& get_gradient_buffer( + TensorDataType& buf_scale, + TensorDataType& in_scale, + bool allreduce_needed = false); - /** @brief Optimization step. */ - void step(); + ///@} + /** @brief Communicator access */ + ///@{ - /** @brief LBANN communicator. */ + /** @brief Access LBANN communicator. */ lbann_comm& get_comm() { return *m_comm; } - /** @brief LBANN communicator. */ + + /** @brief Access LBANN communicator. */ const lbann_comm& get_comm() const { return *m_comm; } - /** @brief Scaling factor for optimization step sizes. */ - DataType get_learning_rate() const; - /** @brief Scaling factor for optimization step sizes. */ - void set_learning_rate(DataType learning_rate); + ///@} + /** @brief Statistics access and management */ + ///@{ /** @brief Time spent in optimization step. */ EvalType get_step_time() const { return m_step_time; } + /** @brief Reset stats counters. 
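The buf_scale/in_scale contract described above is the same one add_to_gradient follows internally. A hedged sketch of an expert-mode caller, assuming opt is an lbann::optimizer and contrib is a distributed matrix holding this caller's local update (requires the optimizer header from this patch):

    template <typename TensorDataType>
    void add_scaled_update(lbann::optimizer& opt,
                           const El::AbstractDistMatrix<TensorDataType>& contrib,
                           TensorDataType scale) {
      TensorDataType buf_scale, in_scale;
      // Request an allreduce because this contribution is local to our rank(s).
      auto& grad = opt.get_gradient_buffer(buf_scale, in_scale,
                                           /*allreduce_needed=*/true);
      El::Scale(buf_scale, grad);                 // rescale existing contents
      El::Axpy(in_scale * scale, contrib, grad);  // accumulate our update
    }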
*/ virtual void reset_counters() { m_step_time = 0; } + ///@} + /** @name Checkpointing */ + ///@{ + + /** @brief Store state to archive for checkpoint and restart */ + template void serialize(Archive & ar) { + // Do not save the optimizer's step time + } + + virtual bool save_to_checkpoint_shared(persist& p, std::string m_name) = 0; + virtual bool load_from_checkpoint_shared(persist& p, std::string m_name) = 0; + virtual bool save_to_checkpoint_distributed(persist& p, std::string m_name) = 0; + virtual bool load_from_checkpoint_distributed(persist& p, std::string m_name) = 0; + ///@} + protected: + /** @brief Manage gradient information. */ + class GradientHelper { + public: + virtual ~GradientHelper() = default; + optimizer_gradient_status get_status() const noexcept { return status_; } + void set_status(optimizer_gradient_status s) noexcept { status_ = s; } + virtual El::BaseDistMatrix& gradient() noexcept = 0; + virtual El::BaseDistMatrix const& gradient() const noexcept = 0; + virtual void start_allreduce(lbann_comm&) = 0; + virtual void complete_allreduce(lbann_comm&) = 0; + virtual void clear() = 0; + private: + optimizer_gradient_status status_ = optimizer_gradient_status::cleared; + };// class GradientHelper + + template + class GradientHelperImpl : public GradientHelper { + public: + using AbsDistMatType = El::AbstractDistMatrix; + public: + GradientHelperImpl(El::Int height, El::Int width, El::DistData dist_data) + : gradient_{AbsDistMatType::Instantiate(dist_data)} + { + El::Zeros(*gradient_, height, width); + } + AbsDistMatType& gradient() noexcept override { return *gradient_; } + AbsDistMatType const& gradient() const noexcept override { + return *gradient_; + } + void start_allreduce(lbann_comm& comm) override { + switch (this->get_status()) { + case optimizer_gradient_status::allreduce_needed: + comm.nb_allreduce(*gradient_, + gradient_->RedundantComm(), + allreduce_req_); + this->set_status(optimizer_gradient_status::allreduce_started); + break; + case optimizer_gradient_status::ready: + case optimizer_gradient_status::cleared: + case optimizer_gradient_status::allreduce_started: + break; + default: LBANN_ERROR("unexpected gradient status " + "(" + to_string(this->get_status()) + ")"); + } + } + void complete_allreduce(lbann_comm& comm) override { + switch (this->get_status()) { + case optimizer_gradient_status::allreduce_started: + comm.wait(allreduce_req_); + this->set_status(optimizer_gradient_status::ready); + break; + case optimizer_gradient_status::ready: + case optimizer_gradient_status::cleared: + break; + case optimizer_gradient_status::allreduce_needed: + LBANN_ERROR("attempted to finish gradient allreduce " + "before starting it"); + break; + default: + LBANN_ERROR("unexpected gradient status " + "(" + to_string(this->get_status()) + ")"); + } + } + void clear() { + this->set_status(optimizer_gradient_status::cleared); + } + private: + std::unique_ptr gradient_; + Al::request allreduce_req_; + };// class GradientHelperImpl + + /** @brief Copy construct/copy assign */ + optimizer(const optimizer& other); + optimizer& operator=(const optimizer& other); - /** @brief Computation for an optimization step. - * - * @c values and @c gradient can be assumed to have the same - * distribution. 
- */ - virtual void step_compute(AbsDistMat& values, - const AbsDistMat& gradient) = 0; + /** @brief Return the current gradient status */ + optimizer_gradient_status get_gradient_status() const { + return m_gradient_status; + } + void set_gradient_status(const optimizer_gradient_status status) { + m_gradient_status = status; + } + std::unordered_set& get_gradient_sources() { + return m_gradient_sources; + } + void set_comm(lbann_comm& comm) { m_comm = &comm; } -private: + void set_step_time(EvalType time) { m_step_time = time; } - /** @brief LBANN communicator. */ - lbann_comm* m_comm; + void inc_step_time(EvalType time) { m_step_time += time; } - /** @brief Weights being optimized. */ - weights* m_weights = nullptr; + virtual std::tuple get_matrix_info() const = 0; - /** @brief Objective function gradient w.r.t. weights. */ - std::unique_ptr m_gradient; + template + void accumulate_all_gradient_contributions( + El::AbstractDistMatrix& gradient); - /** @brief Workspace matrix. + /** @brief Launch non-blocking allreduce on the gradient, if needed. * - * Helps ensure gradient contributions are in the right - * distribution. Most of the time, this should just be a matrix - * view. + * Does nothing if an allreduce is not needed or has already been + * started. */ - std::unique_ptr m_gradient_v; + void start_gradient_allreduce() { + for (auto& grad_mgr : gradients_) { + grad_mgr.second->start_allreduce(*m_comm); + } + } + + /** @brief Synchronize non-blocking allreduce on the gradient, if needed. + * + * Does nothing if an allreduce isn't needed. Throws an exception + * if an allreduce is needed but hasn't been started. + */ + void finish_gradient_allreduce() { + for (auto& grad_mgr : gradients_) { + grad_mgr.second->complete_allreduce(*m_comm); + } + } +private: + + /** @brief LBANN communicator. */ + lbann_comm* m_comm; /** @brief Sources of gradient contributions. * @@ -235,51 +358,156 @@ class optimizer { /** @brief Status of values in objective function gradient. */ optimizer_gradient_status m_gradient_status = optimizer_gradient_status::cleared; - /** @brief Communication request object for gradient allreduce. - * - * Used to synchronize non-blocking allreduce. - */ - Al::request m_gradient_allreduce_req; - - /** @brief Scaling factor for optimization step sizes. - * - * This is not used by the base optimizer class, but is currently - * used by all derived optimizer classes. There are several cases - * where it is convenient to expose this in the base class, - * e.g. for variable learning rate schedules. - * @todo Consider moving this to the derived classes. - */ - DataType m_learning_rate; - /** @brief Time spent in optimization step. */ EvalType m_step_time = 0; - /** @brief Launch non-blocking allreduce on the gradient, if needed. - * - * Does nothing if an allreduce is not needed or has already been - * started. - */ - void start_gradient_allreduce(); - - /** @brief Synchronize non-blocking allreduce on the gradient, if needed. - * - * Does nothing if an allreduce isn't needed. Throws an exception - * if an allreduce is needed but hasn't been started. + /** @brief Map from data types to gradient contributions. + * @todo Refactor this out. It's a hack. 
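Gradient state is now kept per tensor value type, in a map keyed by std::type_index that is filled lazily on first request (see the get_gradient_buffer definition below). A standalone sketch of that storage pattern with toy types:

    #include <iostream>
    #include <memory>
    #include <typeindex>
    #include <unordered_map>
    #include <vector>

    struct buffer { virtual ~buffer() = default; };
    template <typename T> struct typed_buffer : buffer { std::vector<T> data; };

    // One buffer per element type, created the first time that type is requested.
    std::unordered_map<std::type_index, std::unique_ptr<buffer>> buffers;

    template <typename T>
    typed_buffer<T>& get_buffer() {
      auto& slot = buffers[std::type_index(typeid(T))];
      if (!slot) { slot = std::make_unique<typed_buffer<T>>(); }
      return static_cast<typed_buffer<T>&>(*slot);
    }

    int main() {
      get_buffer<float>().data.push_back(1.0f);
      get_buffer<double>().data.push_back(2.0);
      std::cout << buffers.size() << "\n";  // prints 2
    }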
*/ - void finish_gradient_allreduce(); - -public: - - // =========================================== - // Checkpointing - // =========================================== - virtual bool save_to_checkpoint_shared(persist& p, std::string m_name); - virtual bool load_from_checkpoint_shared(persist& p, std::string m_name); - virtual bool save_to_checkpoint_distributed(persist& p, std::string m_name); - virtual bool load_from_checkpoint_distributed(persist& p, std::string m_name); + using gradient_manager_type = GradientHelper; + using gradient_manager_ptr = std::unique_ptr; + std::unordered_map gradients_; }; +template +El::AbstractDistMatrix& optimizer::get_gradient_buffer( + TensorDataType& buf_scale, + TensorDataType& in_scale, + bool allreduce_needed) { + + // Anon enum to clarify "get<#>" calls below. + enum { HEIGHT=0, WIDTH, DISTDATA }; + using GradMgrType = GradientHelperImpl; + + auto& grad_mgr_ptr = gradients_[std::type_index(typeid(TensorDataType))]; + // If the manager hasn't been created, let's make it. + if (!grad_mgr_ptr) { + auto mat_info = this->get_matrix_info(); + grad_mgr_ptr = make_unique( + std::get(mat_info), + std::get(mat_info), + std::get(mat_info)); + grad_mgr_ptr->set_status(optimizer_gradient_status::cleared); + } + // Get the underlying matrix back out. + auto& grad_mgr = static_cast(*grad_mgr_ptr); + // Complete outstanding allreduce, if needed. + if (grad_mgr.get_status() == optimizer_gradient_status::allreduce_started) { + grad_mgr.complete_allreduce(*(this->m_comm)); + } + auto& buffer = grad_mgr.gradient(); + + // Determine scaling factor and transition state. + switch (grad_mgr.get_status()) { + case optimizer_gradient_status::ready: + buf_scale = DataType(1); + in_scale = DataType(1); + if (allreduce_needed) { + buf_scale /= buffer.RedundantSize(); + grad_mgr.set_status(optimizer_gradient_status::allreduce_needed); + } + break; + case optimizer_gradient_status::cleared: + buf_scale = DataType(0); + in_scale = DataType(1); + grad_mgr.set_status(allreduce_needed ? + optimizer_gradient_status::allreduce_needed : + optimizer_gradient_status::ready); + break; + case optimizer_gradient_status::allreduce_needed: + buf_scale = DataType(1); + // Properly scale data that does not need to be allreduced. + in_scale = (allreduce_needed ? + DataType(1) : + DataType(1) / buffer.RedundantSize()); + break; + case optimizer_gradient_status::allreduce_started: + default: + LBANN_ERROR("unexpected gradient status (" + + to_string(grad_mgr.get_status()) + ")"); + } + return buffer; +} + +template +void optimizer::accumulate_all_gradient_contributions( + El::AbstractDistMatrix& gradient) +{ + using AbsDistMatType = El::AbstractDistMatrix; + static const TensorDataType one = TensorDataType(1.f); + + // There are a few cases to note here: + // 1. One update of the same type. + // 2. One update of a different type. + // 3. Multiple updates of multiple types. In this case, some work + // can be saved if one of the updates has the same type as + // "gradient". + + // Some general information + auto num_updates = this->gradients_.size(); + auto const this_type_idx = std::type_index(typeid(TensorDataType)); + + if (num_updates == 0UL) + return; + + // Handle the case that one of the updates is TensorDataType. In + // this case, the input gradients matrix can be made to "view" the + // update, rather than requiring a copy. + auto this_type_contrib = this->gradients_.find(this_type_idx); + if (this_type_contrib != this->gradients_.end()) { + // Check for invariant consistency. 
+ auto const& grad_mgr = *(this_type_contrib->second); + if (grad_mgr.get_status() != optimizer_gradient_status::ready) { + LBANN_ERROR("Expected ready status. Got: ", + to_string(grad_mgr.get_status())); + } + // Sync the input gradient with the contribution, one way or another. + auto const& contrib = + dynamic_cast(grad_mgr.gradient()); + if (contrib.DistData() == gradient.DistData()) { + El::LockedView(gradient, contrib); + } + else { + LBANN_ERROR("Should never need this copy."); + El::Copy(contrib, gradient); + } + --num_updates; + } + else { + // No sync possible; zero out the matrix instead + El::Zero(gradient); + } + + // Handle the case that only 1 update of a different type is needed. + if (num_updates == 1UL && this->gradients_.size() == 1UL) { + auto const& grad_mgr = *(this->gradients_.begin()->second); + if (grad_mgr.get_status() != optimizer_gradient_status::ready) { + LBANN_ERROR("Expected ready status. Got: ", + to_string(grad_mgr.get_status())); + } + El::Copy(grad_mgr.gradient(), gradient); + } + else if (this->gradients_.size() > 1UL) { + // Need a temporary matrix for the type-casted copy. + auto tmp = std::unique_ptr{ + gradient.Construct(gradient.Grid(), gradient.Root())}; + + for (auto const& grad_mgr_v : this->gradients_) { + if (grad_mgr_v.first == this_type_idx) + continue; + auto const& grad_mgr = *(grad_mgr_v.second); + if (grad_mgr.get_status() != optimizer_gradient_status::ready) { + LBANN_ERROR("Expected ready status. Got: ", + to_string(grad_mgr.get_status())); + } + auto const& grad_base = grad_mgr.gradient(); + El::Copy(grad_base, *tmp); + El::Axpy(one, *tmp, gradient); + } + } +} + } // namespace lbann #endif // LBANN_OPTIMIZERS_OPTIMIZER_HPP_INCLUDED diff --git a/include/lbann/optimizers/rmsprop.hpp b/include/lbann/optimizers/rmsprop.hpp index a8debaa076c..dd0b63b6ecd 100644 --- a/include/lbann/optimizers/rmsprop.hpp +++ b/include/lbann/optimizers/rmsprop.hpp @@ -27,8 +27,10 @@ #ifndef LBANN_OPTIMIZERS_RMSPROP_HPP_INCLUDED #define LBANN_OPTIMIZERS_RMSPROP_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" #include +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -37,74 +39,74 @@ namespace lbann { * See * https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf. */ -class rmsprop : public optimizer { +template +class rmsprop : public Cloneable, + data_type_optimizer> { + using BaseType = Cloneable, + data_type_optimizer>; public: + /** @name Public Types */ + ///@{ - rmsprop(lbann_comm* comm, - DataType learning_rate, - DataType decay_rate, - DataType eps = 1e-8); + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} + +public: + + rmsprop(TensorDataType learning_rate, + TensorDataType decay_rate, + TensorDataType eps = 1e-8); rmsprop(const rmsprop& other); rmsprop& operator=(const rmsprop& other); ~rmsprop() override = default; - rmsprop* copy() const override { return new rmsprop(*this); } + + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(cereal::base_class>(this), + CEREAL_NVP(m_decay_rate)); + } /** Human-readable type name. */ std::string get_type() const override { return "RMSprop"; } /** Human-readable description. 
*/ description get_description() const override; - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; protected: /** Computation for an optimization step. */ - void step_compute(AbsDistMat& values, - const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, + const AbsDistMatrixType& gradient) override; private: /** Decay rate. */ - DataType m_decay_rate; + TensorDataType m_decay_rate; /** Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** RMSprop cache. */ - std::unique_ptr m_cache; + std::unique_ptr m_cache; /** CPU implementation of optimization step. */ - void step_compute_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #ifdef LBANN_HAS_CUDA /** GPU implementation of optimization step. */ - void step_compute_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #endif // LBANN_HAS_CUDA // =========================================== // Checkpointing // =========================================== - struct packing_header { - DataType decay_rate; - }; - - bool pack_scalars(persist& p) { - p.write_datatype(persist_type::train, "decay_rate", m_decay_rate); - return true; - } - - bool unpack_scalars(persist& p, struct packing_header *header){ - p.read_datatype(persist_type::train, "momentum", &m_decay_rate); - - if(header != nullptr){ - header->decay_rate = m_decay_rate; - } - - return true; - } - - void unpack_header(struct packing_header& header){ - m_decay_rate = header.decay_rate; - } - bool save_to_checkpoint_shared(persist& p, std::string m_name) override; bool load_from_checkpoint_shared(persist& p, std::string m_name) override; bool save_to_checkpoint_distributed(persist& p, std::string m_name) override; @@ -112,6 +114,11 @@ class rmsprop : public optimizer { }; +template +std::unique_ptr +build_rmsprop_optimizer_from_pbuf( + google::protobuf::Message const&); + } // namespace lbann #endif // LBANN_OPTIMIZERS_RMSPROP_HPP_INCLUDED diff --git a/include/lbann/optimizers/sgd.hpp b/include/lbann/optimizers/sgd.hpp index 2d59b8c2ffe..3cfc66f952a 100644 --- a/include/lbann/optimizers/sgd.hpp +++ b/include/lbann/optimizers/sgd.hpp @@ -27,7 +27,9 @@ #ifndef LBANN_OPTIMIZERS_SGD_HPP_INCLUDED #define LBANN_OPTIMIZERS_SGD_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -35,22 +37,44 @@ namespace lbann { * @details Supports momentum and Nesterov acceleration. * @todo Dedicated optimizers for momentum or Nesterov SGD. */ -class sgd : public optimizer { +template +class sgd : public Cloneable, + data_type_optimizer> { + using BaseType = Cloneable, + data_type_optimizer>; + +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. 
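The m_decay_rate, m_eps, and m_cache members above implement the standard RMSprop rule from the lecture slides cited in the class comment. A hedged scalar sketch of one step (not LBANN's kernel):

    #include <cmath>
    #include <cstdio>

    double rmsprop_step(double w, double g, double& cache, double lr,
                        double decay_rate, double eps = 1e-8) {
      cache = decay_rate * cache + (1 - decay_rate) * g * g;  // running mean of g^2
      return w - lr * g / (std::sqrt(cache) + eps);
    }

    int main() {
      double w = 1.0, cache = 0.0;
      for (int i = 0; i < 3; ++i) { w = rmsprop_step(w, 2.0 * w, cache, 0.1, 0.9); }
      std::printf("%f\n", w);
    }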
*/ + using WeightsType = data_type_weights; + + ///@} public: /** @name Life cycle functions */ ///@{ - sgd(lbann_comm *comm, - DataType learning_rate, - DataType momentum = 0, + sgd(TensorDataType learning_rate, + TensorDataType momentum = 0, bool nesterov = false); sgd(const sgd& other); sgd& operator=(const sgd& other); ~sgd() override = default; - sgd* copy() const override { return new sgd(*this); } + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(cereal::base_class>(this), + CEREAL_NVP(m_momentum)); + } ///@} /** @name Descriptions */ @@ -69,11 +93,11 @@ class sgd : public optimizer { /** @brief Decay rate for gradient accumulation. * @details A momentum of zero corresponds to vanilla SGD. */ - DataType get_momentum() const noexcept { return m_momentum; } + TensorDataType get_momentum() const noexcept { return m_momentum; } /** @brief Decay rate for gradient accumulation. * @details A momentum of zero corresponds to vanilla SGD. */ - void set_momentum(DataType momentum) { m_momentum = momentum; } + void set_momentum(TensorDataType momentum) { m_momentum = momentum; } /** Whether Nesterov acceleration is applied. */ bool using_nesterov() const noexcept { return m_nesterov; } @@ -81,70 +105,47 @@ class sgd : public optimizer { void set_nesterov(bool nesterov) { m_nesterov = nesterov; } /** Accumulated gradients for momentum optimizer. */ - const AbsDistMat& get_velocity() const; + const AbsDistMatrixType& get_velocity() const; /** Accumulated gradients for momentum optimizer. */ - AbsDistMat& get_velocity(); + AbsDistMatrixType& get_velocity(); ///@} /** @name Setup */ ///@{ - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; ///@} protected: /** Computation for an optimization step. */ - void step_compute(AbsDistMat& values, const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, const AbsDistMatrixType& gradient) override; private: /** @brief Decay rate for gradient accumulation. * @details A momentum of zero corresponds to vanilla SGD. */ - DataType m_momentum; + TensorDataType m_momentum; /** Whether Nesterov acceleration is used. */ bool m_nesterov; /** @brief Accumulated gradients. * @details Not used for vanilla SGD. */ - std::unique_ptr m_velocity; + std::unique_ptr m_velocity; /** CPU implementation of momentum or Nesterov step. */ - void momentum_step_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void momentum_step_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #ifdef LBANN_HAS_CUDA /** GPU implementation of momentum or Nesterov step. 
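Similarly, the momentum and Nesterov variants implemented by momentum_step_cpu/gpu follow the conventional formulas; a scalar sketch for reference (not the production kernel):

    #include <cstddef>
    #include <vector>

    // One SGD step over a flat parameter vector; v is the accumulated velocity
    // (m_velocity above), g the current gradient.
    void sgd_step(std::vector<double>& w, std::vector<double>& v,
                  const std::vector<double>& g,
                  double learning_rate, double momentum, bool nesterov)
    {
      for (std::size_t i = 0; i < w.size(); ++i) {
        v[i] = momentum * v[i] + g[i];                              // velocity accumulation
        w[i] -= nesterov ? learning_rate * (g[i] + momentum * v[i]) // Nesterov look-ahead
                         : learning_rate * v[i];                    // classic momentum
      }
    }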
*/ - void momentum_step_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void momentum_step_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #endif // LBANN_HAS_CUDA /** @name Checkpointing */ ///@{ - struct packing_header { - DataType momentum; - }; - - bool pack_scalars(persist& p) { - p.write_datatype(persist_type::train, "momentum", m_momentum); - return true; - } - - bool unpack_scalars(persist& p, struct packing_header *header){ - p.read_datatype(persist_type::train, "momentum", &m_momentum); - - if(header != nullptr){ - header->momentum = m_momentum; - } - - return true; - } - - void unpack_header(struct packing_header& header){ - m_momentum = header.momentum; - } - bool save_to_checkpoint_shared(persist& p, std::string m_name) override; bool load_from_checkpoint_shared(persist& p, std::string m_name) override; bool save_to_checkpoint_distributed(persist& p, std::string m_name) override; @@ -154,6 +155,11 @@ class sgd : public optimizer { }; +template +std::unique_ptr +build_sgd_optimizer_from_pbuf( + google::protobuf::Message const&); + } // namespace lbann #endif // LBANN_OPTIMIZERS_SGD_HPP_INCLUDED diff --git a/include/lbann/proto/CMakeLists.txt b/include/lbann/proto/CMakeLists.txt index 59dbee3097d..6ac2825f3e7 100644 --- a/include/lbann/proto/CMakeLists.txt +++ b/include/lbann/proto/CMakeLists.txt @@ -2,6 +2,8 @@ set_full_path(THIS_DIR_HEADERS init_image_data_readers.hpp proto_common.hpp + helpers.hpp + datatype_helpers.hpp ) # Propagate the files up the tree diff --git a/include/lbann/proto/datatype_helpers.hpp b/include/lbann/proto/datatype_helpers.hpp new file mode 100644 index 00000000000..0c91d878de5 --- /dev/null +++ b/include/lbann/proto/datatype_helpers.hpp @@ -0,0 +1,70 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_PROTO_DATATYPE_HELPERS_HPP_INCLUDED +#define LBANN_PROTO_DATATYPE_HELPERS_HPP_INCLUDED + +#include + +namespace lbann +{ +namespace proto +{ + +template +struct TypeToProtoDataType; + +template <> +struct TypeToProtoDataType +{ + static constexpr auto value = lbann_data::FLOAT; +}; + +template <> +struct TypeToProtoDataType +{ + static constexpr auto value = lbann_data::DOUBLE; +}; + +#ifdef LBANN_HAS_HALF +template <> +struct TypeToProtoDataType +{ + static constexpr auto value = lbann_data::FP16; +}; +#endif // LBANN_HAS_HALF + +#ifdef LBANN_HAS_GPU_FP16 +template <> +struct TypeToProtoDataType +{ + static constexpr auto value = lbann_data::FP16; +}; +#endif // LBANN_HAS_GPU_FP16 + +}// namespace proto +}// namespace lbann +#endif /* LBANN_PROTO_DATATYPE_HELPERS_HPP_INCLUDED */ diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index ca68f30975d..2b76613be66 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -27,81 +27,104 @@ #ifndef LBANN_PROTO_FACTORIES_HPP_INCLUDED #define LBANN_PROTO_FACTORIES_HPP_INCLUDED -#include "lbann/proto/proto_common.hpp" #include "lbann/data_readers/data_reader.hpp" +#include "lbann/proto/proto_common.hpp" +#include "lbann/transforms/transform.hpp" +#include "lbann/transforms/transform_pipeline.hpp" + +#include + +#include +#include + +namespace lbann_data { +class Layer; +class Model; +class ObjectiveFunction; +class Optimizer; +class Reader; +class Transform; +class Weights; +}// namespace lbann_data namespace lbann { + +// Forward declarations +class callback_base; +class Layer; +class lbann_summary; +class model; +class objective_function; +class optimizer; +class trainer; +class weights; + namespace proto { +/** Construct a trainer specified with a prototext. */ +std::unique_ptr construct_trainer(lbann_comm* comm, + const lbann_data::Trainer& proto_trainer); + /** Construct a model specified with a prototext. */ -model* construct_model(lbann_comm* comm, - const std::map& data_readers, - const lbann_data::Optimizer& proto_opt, - const lbann_data::Model& proto_model); +std::unique_ptr construct_model( + lbann_comm* comm, + int training_dr_linearized_data_size, + const lbann_data::Optimizer& proto_opt, + const lbann_data::Trainer& proto_trainer, + const lbann_data::Model& proto_model); /** Construct a layer graph specified with a prototext. */ std::vector> construct_layer_graph( lbann_comm* comm, - const std::map& data_readers, + int training_dr_linearized_data_size, + const lbann_data::Trainer& proto_trainer, const lbann_data::Model& proto_model); /** Construct a layer specified with prototext. */ -template +template std::unique_ptr construct_layer( lbann_comm* comm, - const std::map& data_readers, + int training_dr_linearized_data_size, int num_parallel_readers, const lbann_data::Layer& proto_layer); /** Construct weights specified with prototext. */ -weights* construct_weights(lbann_comm* comm, - const lbann_data::Optimizer& proto_opt, - const lbann_data::Weights& proto_weights); +std::unique_ptr construct_weights( + lbann_comm* comm, + const lbann_data::Optimizer& proto_opt, + const lbann_data::Weights& proto_weights); /** Construct a callback specified with prototext. 
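The TypeToProtoDataType trait added in datatype_helpers.hpp above maps a C++ element type to the generated lbann_data::DataType enum at compile time. A usage sketch; the lbann.pb.h include name is an assumption about the generated protobuf header:

    #include "lbann/proto/datatype_helpers.hpp"

    #include <lbann.pb.h>

    // Resolve the protobuf enum value for a tensor's element type.
    constexpr auto kFloatValue  = lbann::proto::TypeToProtoDataType<float>::value;  // lbann_data::FLOAT
    constexpr auto kDoubleValue = lbann::proto::TypeToProtoDataType<double>::value; // lbann_data::DOUBLE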
*/ -lbann_callback* construct_callback(lbann_comm* comm, - const lbann_data::Callback& proto_cb, - const std::map& data_readers, - std::vector layer_list, - std::vector weights_list, - lbann_summary* summarizer); +std::unique_ptr +construct_callback(const google::protobuf::Message& proto_cb); + +/** Construct a callback specified with prototext. */ +std::unique_ptr +construct_callback(const google::protobuf::Message& proto_cb, + std::shared_ptr const& summarizer); /** Construct a summarizer specified with prototext. * The summarizer is only constructed if the summarizer callback is * enabled. */ -lbann_summary* construct_summarizer(lbann_comm* comm, - const lbann_data::Model& m); +std::unique_ptr construct_summarizer(lbann_comm* comm, + const lbann_data::Model& m); /** Construct an optimizer specified with prototext. */ -optimizer* construct_optimizer(lbann_comm* comm, - const lbann_data::Optimizer& proto_opt); +template +std::unique_ptr construct_optimizer( + const lbann_data::Optimizer& proto_opt); /** Construct an objective function specified with prototext. */ -objective_function* construct_objective_function(const lbann_data::ObjectiveFunction& proto_obj); - -/** Parse a space-separated list. */ -template -std::vector parse_list(std::string str) { - std::vector list; - std::stringstream ss(str); - for (T entry; ss >> entry;) { - list.push_back(entry); - } - return list; -} -template <> -std::vector parse_list(std::string str); - -/** Parse a space-separated set. */ -template -std::set parse_set(std::string str) { - std::set set; - for (const auto& entry : parse_list(str)) { - set.insert(entry); - } - return set; -} +std::unique_ptr +construct_objective_function(const lbann_data::ObjectiveFunction& proto_obj); + +/** Construct a transform given a prototext. */ +std::unique_ptr construct_transform( + const lbann_data::Transform& trans); +/** Construct a transform pipeline given a data reader prototext. */ +transform::transform_pipeline construct_transform_pipeline( + const lbann_data::Reader& data_reader); } // namespace proto } // namespace lbann diff --git a/include/lbann/proto/helpers.hpp b/include/lbann/proto/helpers.hpp new file mode 100644 index 00000000000..0a8cf656409 --- /dev/null +++ b/include/lbann/proto/helpers.hpp @@ -0,0 +1,66 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_PROTO_HELPERS_HPP_INCLUDED +#define LBANN_PROTO_HELPERS_HPP_INCLUDED + +#include + +#include +#include +#include + +namespace lbann +{ +namespace proto +{ + +template +struct GenerateBuilderType_struct +{ + using type = std::function(Args...)>; +}; + +template +using generate_builder_type = + typename GenerateBuilderType_struct::type; + +namespace helpers +{ + +/** @brief Test whether the message has the oneof field. */ +bool has_oneof( + google::protobuf::Message const& msg, std::string const& oneof_name); + +/** @brief Get a "derived type" message from the given message. */ +google::protobuf::Message const& +get_oneof_message( + google::protobuf::Message const& msg_in, std::string const& oneof_name); + +}// namespace helpers +}// namespace proto +}// namespace lbann +#endif /* LBANN_PROTO_HELPERS_HPP_INCLUDED */ diff --git a/include/lbann/proto/init_image_data_readers.hpp b/include/lbann/proto/init_image_data_readers.hpp index f35a5797e2b..4b585998599 100644 --- a/include/lbann/proto/init_image_data_readers.hpp +++ b/include/lbann/proto/init_image_data_readers.hpp @@ -26,13 +26,18 @@ #ifndef LBANN_PROTO_INIT_IMAGE_DATA_READERS_HPP_INCLUDED #define LBANN_PROTO_INIT_IMAGE_DATA_READERS_HPP_INCLUDED + #include "lbann/proto/proto_common.hpp" #include "lbann/comm.hpp" +namespace lbann_data { +class Reader; +class DataSetMetaData; +} + namespace lbann { extern void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_data::DataSetMetaData& pb_metadata, const bool master, generic_data_reader* &reader); -extern void init_generic_preprocessor(const lbann_data::Reader& pb_readme, const bool master, generic_data_reader* reader); extern void init_org_image_data_reader(const lbann_data::Reader& pb_readme, const bool master, generic_data_reader* &reader); } // namespace lbann diff --git a/include/lbann/proto/proto_common.hpp b/include/lbann/proto/proto_common.hpp index b9986dfcc99..8bb4d50fc25 100644 --- a/include/lbann/proto/proto_common.hpp +++ b/include/lbann/proto/proto_common.hpp @@ -1,16 +1,50 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
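The helpers declared above let factories inspect a protobuf message whose concrete payload lives in a oneof. A hypothetical dispatch fragment; the field name "layer_type" is illustrative and not taken from the .proto definitions:

    #include "lbann/proto/helpers.hpp"

    #include <google/protobuf/message.h>

    #include <string>

    void dispatch_on_oneof(google::protobuf::Message const& proto_layer)
    {
      using namespace lbann::proto::helpers;
      if (has_oneof(proto_layer, "layer_type")) {
        auto const& params = get_oneof_message(proto_layer, "layer_type");
        // A factory would branch on the concrete message's name here.
        std::string const concrete_name = params.GetDescriptor()->name();
        (void) concrete_name;
      }
      // Otherwise no layer parameters were set in the prototext.
    }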
+//////////////////////////////////////////////////////////////////////////////// + #ifndef LBANN_PROTO_PROTO_COMMON_HPP_INCLUDED #define LBANN_PROTO_PROTO_COMMON_HPP_INCLUDED -#include "lbann/lbann.hpp" -#include -#include "lbann/proto/factories.hpp" +#include "lbann/data_readers/data_reader.hpp" -namespace lbann { +#define LBANN_ASSERT_MSG_HAS_FIELD(MSG, FIELD) \ + do { \ + if (!MSG.has_##FIELD()) { \ + LBANN_ERROR("No field \"" #FIELD "\" in the given message:\n{\n", \ + MSG.DebugString(), "\n}\n"); \ + } \ + } \ + while(false) -/** @brief Returns true if the Model contains at least one MotifLayer */ -bool has_motifs(const lbann_comm& comm, const lbann_data::LbannPB& p); +// Forward declaration of protobuf classes +namespace lbann_data { +class LbannPB; +class Trainer; +} -void expand_motifs(const lbann_comm& comm, lbann_data::LbannPB& pb); +namespace lbann { /** @brief Customize the name of the index list * @@ -25,27 +59,31 @@ void expand_motifs(const lbann_comm& comm, lbann_data::LbannPB& pb); _t_. @endverbatim */ void customize_data_readers_index_list(const lbann_comm& comm, - lbann_data::LbannPB& p); + ::lbann_data::LbannPB& p); /** @brief instantiates one or more generic_data_readers and inserts * them in &data_readers */ void init_data_readers( lbann_comm *comm, - const lbann_data::LbannPB& p, + const ::lbann_data::LbannPB& p, std::map& data_readers, bool is_shareable_training_data_reader, bool is_shareable_testing_data_reader, bool is_shareable_validation_data_reader = false); /** @brief adjusts the number of parallel data readers */ -void set_num_parallel_readers(const lbann_comm& comm, lbann_data::LbannPB& p); +void set_num_parallel_readers(const lbann_comm& comm, ::lbann_data::LbannPB& p); /** @brief adjusts the values in p by querying the options db */ -void get_cmdline_overrides(const lbann_comm& comm, lbann_data::LbannPB& p); +void get_cmdline_overrides(const lbann_comm& comm, ::lbann_data::LbannPB& p); /** @brief print various params (learn_rate, etc) to cout */ -void print_parameters(const lbann_comm& comm, lbann_data::LbannPB& p); +void print_parameters(const lbann_comm& comm, + ::lbann_data::LbannPB& p, + std::vector& root_random_seeds, + std::vector& random_seeds, + std::vector& data_seq_random_seeds); /** @brief prints usage information */ void print_help(const lbann_comm& comm); @@ -56,18 +94,85 @@ void print_help(std::ostream& os); /** @brief prints prototext file, cmd line, etc to file */ void save_session(const lbann_comm& comm, const int argc, char * const* argv, - lbann_data::LbannPB& p); + ::lbann_data::LbannPB& p); /** @brief Read prototext from a file into a protobuf message. */ void read_prototext_file( const std::string& fn, - lbann_data::LbannPB& pb, + ::lbann_data::LbannPB& pb, const bool master); /** @brief Write a protobuf message into a prototext file. */ bool write_prototext_file( const std::string& fn, - lbann_data::LbannPB& pb); + ::lbann_data::LbannPB& pb); + +/** @brief Trim leading and trailing whitespace from a string. 
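The LBANN_ASSERT_MSG_HAS_FIELD macro defined at the top of this proto_common.hpp hunk gives builders a uniform way to reject incomplete messages. A hypothetical use with an illustrative field name; LBANN_ERROR is assumed to come from lbann/utils/exception.hpp:

    #include "lbann/proto/proto_common.hpp"
    #include "lbann/utils/exception.hpp"

    template <typename OptimizerMsg>
    double get_learn_rate_checked(OptimizerMsg const& proto_opt)
    {
      // Aborts with LBANN_ERROR, printing proto_opt.DebugString(), if the
      // message does not provide the field.
      LBANN_ASSERT_MSG_HAS_FIELD(proto_opt, learn_rate);
      return proto_opt.learn_rate();
    }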
*/ +std::string trim(std::string const& str); + +// These functions work on trimmed, nonempty strings +namespace details { + +template +std::vector parse_list_impl(std::string const& str) { +#ifdef LBANN_HAS_GPU_FP16 + using ParseType = typename std::conditional::value, float, T>::type; +#else + using ParseType = T; +#endif + ParseType entry; + std::vector list; + std::istringstream iss(str); + while (iss.good()) { + iss >> entry; + list.emplace_back(std::move(entry)); + } + return list; +} + +template +std::set parse_set_impl(std::string const& str) { +#ifdef LBANN_HAS_GPU_FP16 + using ParseType = typename std::conditional::value, float, T>::type; +#else + using ParseType = T; +#endif + ParseType entry; + std::set set; + std::istringstream iss(str); + while(iss.good()) { + iss >> entry; + set.emplace(std::move(entry)); + } + return set; +} + +// TODO (trb 07/25/19): we should think about what to do about bad +// input. That is, if a user calls parse_list("one two three"), +// the result is undefined (one test I did gave [0,0,0] and another +// gave [INT_MAX,INT_MAX,INT_MAX]). In most cases in LBANN, I would +// guess that this will result in a logic error further down the +// codepath, but we shouldn't count on it. + +}// namespace details + +/** @brief Parse a space-separated list. */ +template +std::vector parse_list(std::string const& str) { + auto trim_str = trim(str); + if (trim_str.size()) + return details::parse_list_impl(trim_str); + return {}; +} + +/** @brief Parse a space-separated set. */ +template +std::set parse_set(std::string const& str) { + auto trim_str = trim(str); + if (trim_str.size()) + return details::parse_set_impl(trim_str); + return {}; +} } // namespace lbann diff --git a/include/lbann/trainers/CMakeLists.txt b/include/lbann/trainers/CMakeLists.txt new file mode 100644 index 00000000000..827647c3c7a --- /dev/null +++ b/include/lbann/trainers/CMakeLists.txt @@ -0,0 +1,7 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + trainer.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/trainers/trainer.hpp b/include/lbann/trainers/trainer.hpp new file mode 100644 index 00000000000..9d4ade530ba --- /dev/null +++ b/include/lbann/trainers/trainer.hpp @@ -0,0 +1,251 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
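The reworked parse_list and parse_set shown above now trim their input and return an empty container for whitespace-only strings; typical use:

    #include "lbann/proto/proto_common.hpp"

    #include <string>

    void parse_examples()
    {
      auto dims  = lbann::parse_list<int>(" 32 64 128 ");        // {32, 64, 128}
      auto modes = lbann::parse_set<std::string>("train test");  // {"test", "train"}
      auto none  = lbann::parse_list<double>("   ");             // {}
      (void) dims; (void) modes; (void) none;
    }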
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRAINER_HPP +#define LBANN_TRAINER_HPP + +#include "lbann/base.hpp" +#include "lbann/comm.hpp" +#include "lbann/data_coordinator/data_coordinator.hpp" +#include "lbann/models/model.hpp" +#include "lbann/execution_contexts/execution_context.hpp" +#include "lbann/io/persist.hpp" +#include "lbann/utils/threads/thread_pool.hpp" +#include "lbann/utils/hash.hpp" +#include +#include +#include +#include + +namespace lbann { + +// Forward-declare this. +class lbann_callback; +class training_algorithm; +class termination_criteria; + +/** Represents an LBANN trainer and its context. */ +class trainer { +public: + + /** Constructor. */ + trainer(lbann_comm *comm, + size_t mini_batch_size); + + /** Copy constructor. */ + trainer(const trainer& other); + /** Copy assignment operator. */ + trainer& operator=(const trainer& other); + /** Destructor. */ + ~trainer(); + + /** Archive for checkpoint and restart */ + template void serialize(Archive & ar) { + ar(CEREAL_NVP(m_persist), + CEREAL_NVP(m_max_mini_batch_size), + CEREAL_NVP(m_root_random_seed), + CEREAL_NVP(m_random_seed), + CEREAL_NVP(m_data_seq_random_seed)); + } + + /** Set the trainer's name; this is an arbitrary string + * that may be useful in multi-trainer scenarios, e.g, + * LTFB, jag + */ + void set_name(std::string const& name); + + /** Return the trainer's name; this is an arbitrary string + * that may be useful in multi-trainer scenarios, e.g, + * LTFB, jag + */ + std::string get_name() const { + return m_name; + } + + /** Human-readable description. */ + description get_description() const; + + /** Set the random seeds used for the trainer */ + void set_random_seeds(int root_random_seed, int random_seed, int data_seq_random_seed) { + m_root_random_seed = root_random_seed; + m_random_seed = random_seed; + m_data_seq_random_seed = data_seq_random_seed; + } + + int get_random_seed() const { return m_random_seed; } + int get_data_seq_random_seed() const { return m_data_seq_random_seed; } + + /** @brief Get the list of callbacks for the trainer. */ + std::vector> get_callbacks() { + std::vector> callback_list; + callback_list.reserve(m_callbacks.size()); + for (const auto& ptr : m_callbacks) { + callback_list.push_back(ptr.get()); + } + return callback_list; + } + + void add_callback(std::shared_ptr cb) { + if (cb == nullptr) { + throw lbann_exception("model: Attempted to add null pointer as a callback."); + } + m_callbacks.push_back(std::move(cb)); + } + + std::vector>& get_callbacks_with_ownership() { + return m_callbacks; + } + + /** Set up the trainer. 
*/ + void setup(std::unique_ptr io_thread_pool, std::map data_readers); + + using execution_context_key_pair_t = typename std::pair, execution_mode>; + + execution_context_key_pair_t + check_and_build_execution_context(training_algorithm& alg, + observer_ptr model, + execution_mode mode); + + execution_context_key_pair_t + check_and_build_execution_context(execution_context& c, + model& model, + execution_mode mode); + + execution_context& get_execution_context(observer_ptr model, + execution_mode mode); + + execution_context& get_execution_context(execution_context_key_pair_t key); + + void delete_execution_context(execution_context_key_pair_t key); + + void for_each_execution_context(std::function)>fn); + + data_coordinator& get_data_coordinator() { return m_data_coordinator; } + + void apply(training_algorithm& alg, + observer_ptr model, + execution_mode mode, + termination_criteria const& term_criteria); + + void train(observer_ptr model, El::Int num_epochs, El::Int num_batches=0); + + void evaluate(observer_ptr model, execution_mode mode, El::Int num_batches=0); + + /** Return the I/O thread pool */ + thread_pool& get_io_thread_pool() const { + if (!m_io_thread_pool) { LBANN_ERROR("m_io_thread_pool is null"); } + return *(m_io_thread_pool.get()); + } + + /** Get the trainer's comm. */ + inline lbann_comm *get_comm() const { + return m_comm; + } + + /** Get the trainer's persist object */ + inline persist& get_persist_obj() { + return m_persist; + } + + /** Get the trainer's maximum mini-batch size. */ + inline size_t get_max_mini_batch_size() const { + return m_max_mini_batch_size; + } + + /** Set a flag that can be used to enable / disable the background I/O activities */ + void allow_background_io_activity(bool enable) { m_background_io_allowed = enable; } + + /** Are background I/O activities enabled by the input layers */ + bool background_io_activity_allowed() { return m_background_io_allowed; } + + // =========================================== + // Checkpointing + // =========================================== + + /** @brief Checkpoint model to given file descriptor, return number of bytes written */ + bool save_to_checkpoint_shared(); + /** @brief Restore model by reading checkpoint from given file descriptor, return number of bytes read */ + bool load_from_checkpoint_shared(persist& p); + bool load_from_checkpoint_shared(model& m, execution_context& c); + + bool save_to_checkpoint_distributed(); + bool load_from_checkpoint_distributed(persist& p); + bool load_from_checkpoint_distributed(model& m, execution_context& c); + + /** @brief Write model to proto file */ + void write_proto(lbann_data::Trainer* proto); + +private: + + /** Give trainer a name. */ + std::string m_name; + + /** Communicator for the trainer. */ + lbann_comm *m_comm; + + /** @details Maximum possible minibatch size supported by models and + * layers in this trainer. Note that this field will eventually be + * local to the particular, instance of the training context.. 
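Pulling the public trainer interface above together, a rough driver fragment might look like the following. The thread-pool and data-reader map types are inferred (the template arguments are stripped in this hunk), so treat the exact signatures as assumptions:

    #include "lbann/trainers/trainer.hpp"

    #include <map>
    #include <memory>
    #include <utility>

    void run_training(lbann::lbann_comm* comm,
                      std::unique_ptr<lbann::thread_pool> io_pool,
                      std::map<lbann::execution_mode, lbann::generic_data_reader*> readers,
                      lbann::model& net)
    {
      lbann::trainer t(comm, /*mini_batch_size=*/128);
      t.set_name("trainer0");
      t.set_random_seeds(/*root_random_seed=*/42,
                         /*random_seed=*/43,
                         /*data_seq_random_seed=*/44);
      t.setup(std::move(io_pool), std::move(readers));
      t.train(&net, /*num_epochs=*/10);  // observer_ptr<model> is a non-owning pointer
    }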
+ */ + size_t m_max_mini_batch_size; + + // Root of the random seed tree: either default or user supplied + int m_root_random_seed; + // Random seed used for the general RNGs + int m_random_seed; + // Random seed used for the RNG used to fetch data + int m_data_seq_random_seed; + + /** Threads available for I/O */ + std::unique_ptr m_io_thread_pool; + + /** Flag that allows input layers to fetch data in the background */ + bool m_background_io_allowed; + + /** Persist object used for serializing LBANN classes */ + persist m_persist; + + /** Hash function for @c m_model_execution_context */ + using model_execution_context_hash_t = pair_hash, + execution_mode, + std::hash>, + enum_hash>; + + /** @brief Map from model and execution mode to its execution context */ + std::unordered_map, execution_mode>, + std::unique_ptr, + model_execution_context_hash_t> m_model_execution_context; + + /** @brief Current callbacks to process. */ + std::vector> m_callbacks; + + /** @brief Data Coordinator holding trainers data readers */ + data_coordinator m_data_coordinator; +}; + +} // namespace lbann + +#endif // LBANN_TRAINER_HPP diff --git a/include/lbann/training_algorithms/CMakeLists.txt b/include/lbann/training_algorithms/CMakeLists.txt new file mode 100644 index 00000000000..2240711572d --- /dev/null +++ b/include/lbann/training_algorithms/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + training_algorithm.hpp + sgd_training_algorithm.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/training_algorithms/sgd_training_algorithm.hpp b/include/lbann/training_algorithms/sgd_training_algorithm.hpp new file mode 100644 index 00000000000..5721b00d670 --- /dev/null +++ b/include/lbann/training_algorithms/sgd_training_algorithm.hpp @@ -0,0 +1,111 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_SGD_TRAINING_ALGORITHM_HPP +#define LBANN_SGD_TRAINING_ALGORITHM_HPP + +#include "lbann/training_algorithms/training_algorithm.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" + +namespace lbann { + +/** @brief Base class for LBANN SGD-family training algorithms. */ +class sgd_training_algorithm : public training_algorithm { +public: + + /** Constructor. */ + sgd_training_algorithm() {}; + /** Copy constructor. 
*/ + sgd_training_algorithm(const sgd_training_algorithm& other) = default; + /** Copy assignment operator. */ + sgd_training_algorithm& operator=(const sgd_training_algorithm& other) = default; + /** Move constructor. */ + sgd_training_algorithm(sgd_training_algorithm&& other) = default; + /** Move assignment operator. */ + sgd_training_algorithm& operator=(sgd_training_algorithm&& other) = default; + /** Destructor. */ + virtual ~sgd_training_algorithm() = default; + /** Copy training_algorithm. */ + // virtual sgd_training_algorithm* copy() const = default; + + std::string get_name() const { return "sgd"; } + + // =========================================== + // Execution + // =========================================== + + /** Apply the training algorithm to the model with the provided + context and execution mode */ + void apply(execution_context& c, + model& model, + data_coordinator& dc, + execution_mode mode, + termination_criteria const& term_criteria) override; + + /** Train a model using an iterative SGD solver. */ + void train(sgd_execution_context& c, + model& model, + data_coordinator& dc, + size_t num_epochs, size_t num_batches=0); + + /** Evaluate a model using the forward pass of an SGD solver. */ + void evaluate(sgd_execution_context& c, + model& model, + data_coordinator& dc, + execution_mode mode, size_t num_batches=0); + +protected: + /** Train model on one step / mini-batch of an SGD forward pass */ + virtual bool train_mini_batch(sgd_execution_context& c, model& model, data_coordinator& dc); + + /** Evaluate model on one step / mini-batch of an SGD forward pass */ + virtual bool evaluate_mini_batch(sgd_execution_context& c, model& model, data_coordinator& dc, execution_mode mode); + + //////////////////////////////////////////////////////////// + // Callbacks + //////////////////////////////////////////////////////////// + + /** Execute callbacks at start of training. */ + virtual void do_train_begin_cbs(model& model); + /** Execute callbacks at end of training. */ + virtual void do_train_end_cbs(model& model); + /** Execute callbacks at start of evaluation. */ + virtual void do_evaluate_begin_cbs(model& model, execution_mode mode); + /** Execute callbacks at end of evaluation. */ + virtual void do_evaluate_end_cbs(model& model, execution_mode mode); + /** Execute callbacks at start of epoch. */ + virtual void do_epoch_begin_cbs(model& model); + /** Execute callbacks at end of epoch. */ + virtual void do_epoch_end_cbs(model& model); + /** Execute callbacks at start of mini-batch. */ + virtual void do_batch_begin_cbs(model& model, execution_mode mode); + /** Execute callbacks at end of mini-batch. */ + virtual void do_batch_end_cbs(model& model, execution_mode mode); +}; + +} // namespace lbann + +#endif // LBANN_SGD_TRAINING_ALGORITHM_HPP diff --git a/include/lbann/training_algorithms/training_algorithm.hpp b/include/lbann/training_algorithms/training_algorithm.hpp new file mode 100644 index 00000000000..dfec6bfeef1 --- /dev/null +++ b/include/lbann/training_algorithms/training_algorithm.hpp @@ -0,0 +1,73 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRAINING_ALGORITHM_HPP +#define LBANN_TRAINING_ALGORITHM_HPP + +#include "lbann/base.hpp" +#include "lbann/execution_contexts/execution_context.hpp" +#include "lbann/models/model.hpp" +#include "lbann/data_coordinator/data_coordinator.hpp" + +namespace lbann { + +// Forward-declare this. +class execution_context; + +/** Base class for LBANN training_algorithms. */ +class training_algorithm { +public: + + /** Constructor. */ + training_algorithm() {}; + /** Copy constructor. */ + training_algorithm(const training_algorithm& other) = default; + /** Copy assignment operator. */ + training_algorithm& operator=(const training_algorithm& other) = default; + /** Move constructor. */ + training_algorithm(training_algorithm&& other) = default; + /** Move assignment operator. */ + training_algorithm& operator=(training_algorithm&& other) = default; + /** Destructor. */ + virtual ~training_algorithm() = default; + /** Copy training_algorithm. */ + // virtual training_algorithm* copy() const = default; + + virtual std::string get_name() const = 0; + + virtual void apply(execution_context& context, + model& model, + data_coordinator& dc, + execution_mode mode, + termination_criteria const& term_criteria) = 0; + + void setup_models(std::vector> models, size_t max_mini_batch_size, DataReaderMetaData& dr_metadata); + +}; + +} // namespace lbann + +#endif // LBANN_TRAINING_ALGORITHM_HPP diff --git a/include/lbann/transforms/CMakeLists.txt b/include/lbann/transforms/CMakeLists.txt new file mode 100644 index 00000000000..73511e8f331 --- /dev/null +++ b/include/lbann/transforms/CMakeLists.txt @@ -0,0 +1,15 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + normalize.hpp + repack_HWC_to_CHW_layout.hpp + sample_normalize.hpp + scale.hpp + scale_and_translate.hpp + transform.hpp + transform_pipeline.hpp + ) + +add_subdirectory(vision) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/transforms/normalize.hpp b/include/lbann/transforms/normalize.hpp new file mode 100644 index 00000000000..77bfa649489 --- /dev/null +++ b/include/lbann/transforms/normalize.hpp @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
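training_algorithm above is a small abstract interface: a name plus an apply hook that drives a model under an execution context until the termination criteria are met; sgd_training_algorithm is the one concrete implementation in this patch. A skeleton of what another implementation would provide (hypothetical class, assuming termination_criteria is visible through the included headers):

    #include "lbann/training_algorithms/training_algorithm.hpp"

    #include <string>

    class my_search_algorithm : public lbann::training_algorithm {
    public:
      std::string get_name() const override { return "my_search"; }

      void apply(lbann::execution_context& context,
                 lbann::model& model,
                 lbann::data_coordinator& dc,
                 lbann::execution_mode mode,
                 lbann::termination_criteria const& term_criteria) override
      {
        // Drive forward/backward passes here; sgd_training_algorithm::apply is
        // the reference for how the context and criteria are consumed.
      }
    };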
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_NORMALIZE_HPP_INCLUDED +#define LBANN_TRANSFORMS_NORMALIZE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" +#include "lbann/utils/exception.hpp" + +#include + +#include + +namespace lbann { +namespace transform { + +/** + * Normalize with mean and standard deviation. + * This is done channel-wise for images. If the input does not have channels, + * (e.g. it is not an image), it is treated as having one "channel". + * This is only applicable after conversion to an LBANN CPUMat. + */ +class normalize : public transform { +public: + /** Apply channel-wise means and standard deviations. */ + normalize(std::vector means, std::vector stds) : + transform(), m_means(means), m_stds(stds) { + if (m_means.size() != m_stds.size()) { + LBANN_ERROR("Normalize mean and std have different numbers of channels."); + } + } + + transform* copy() const override { return new normalize(*this); } + + std::string get_type() const override { return "normalize"; } + + bool supports_non_inplace() const override { return true; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + void apply(utils::type_erased_matrix& data, CPUMat& out, + std::vector& dims) override; +private: + /** Channel-wise means. */ + std::vector m_means; + /** Channel-wise standard deviations. */ + std::vector m_stds; +}; + +// Builder function +std::unique_ptr +build_normalize_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_NORMALIZED_CENTER_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp b/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp new file mode 100644 index 00000000000..59a02fc78fe --- /dev/null +++ b/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp @@ -0,0 +1,56 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_TRANSFORMS_REPACK_HWC_TO_CHW_LAYOUT_HPP_INCLUDED
+#define LBANN_TRANSFORMS_REPACK_HWC_TO_CHW_LAYOUT_HPP_INCLUDED
+
+#include "lbann/transforms/transform.hpp"
+
+namespace lbann {
+namespace transform {
+
+/**
+ * Convert data to LBANN's native data layout.
+ * Currently only supports converting from an interleaved channel format.
+ */
+class repack_HWC_to_CHW_layout : public transform {
+public:
+  transform* copy() const override { return new repack_HWC_to_CHW_layout(*this); }
+
+  std::string get_type() const override { return "to_lbann_layout"; }
+
+  bool supports_non_inplace() const override { return true; }
+
+  void apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) override;
+
+  void apply(utils::type_erased_matrix& data, CPUMat& out,
+             std::vector<size_t>& dims) override;
+};
+
+} // namespace transform
+} // namespace lbann
+
+#endif // LBANN_TRANSFORMS_REPACK_HWC_TO_CHW_LAYOUT_HPP_INCLUDED
diff --git a/include/lbann/transforms/sample_normalize.hpp b/include/lbann/transforms/sample_normalize.hpp
new file mode 100644
index 00000000000..b6766d16915
--- /dev/null
+++ b/include/lbann/transforms/sample_normalize.hpp
@@ -0,0 +1,57 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_TRANSFORMS_SAMPLE_NORMALIZE_HPP_INCLUDED
+#define LBANN_TRANSFORMS_SAMPLE_NORMALIZE_HPP_INCLUDED
+
+#include "lbann/transforms/transform.hpp"
+
+#include <google/protobuf/message.h>
+
+namespace lbann {
+namespace transform {
+
+/**
+ * Normalize to have mean 0, standard deviation 1.
+ * This only works after conversion to an LBANN CPUMat.
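Per the comment above, sample_normalize standardizes each sample to zero mean and unit standard deviation. A scalar reference of that computation under the same assumption, operating on a flat copy of one sample rather than a CPUMat (a real implementation would also guard against a zero standard deviation):

    #include <cmath>
    #include <vector>

    void sample_normalize_reference(std::vector<float>& x)
    {
      float mean = 0.f;
      for (float v : x) { mean += v; }
      mean /= x.size();

      float var = 0.f;
      for (float v : x) { var += (v - mean) * (v - mean); }
      const float stdev = std::sqrt(var / x.size());

      for (float& v : x) { v = (v - mean) / stdev; }  // now mean 0, std 1
    }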
+ */ +class sample_normalize : public transform { +public: + transform* copy() const override { return new sample_normalize(*this); } + + std::string get_type() const override { return "sample_normalize"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +}; + +// Builder function +std::unique_ptr +build_sample_normalize_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_SAMPLE_NORMALIZE_HPP_INCLUDED diff --git a/include/lbann/transforms/scale.hpp b/include/lbann/transforms/scale.hpp new file mode 100644 index 00000000000..36ff3bad6ad --- /dev/null +++ b/include/lbann/transforms/scale.hpp @@ -0,0 +1,62 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_SCALE_HPP_INCLUDED +#define LBANN_TRANSFORMS_SCALE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +#include + +namespace lbann { +namespace transform { + +/** Scale data by a constant. */ +class scale : public transform { +public: + /** Scale all data by scale_val. */ + scale(float scale_val) : transform(), m_scale(scale_val) {} + + transform* copy() const override { return new scale(*this); } + + std::string get_type() const override { return "scale"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Amount to scale data by. */ + float m_scale; +}; + +// Builder function +std::unique_ptr +build_scale_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_SCALE_HPP_INCLUDED diff --git a/include/lbann/transforms/scale_and_translate.hpp b/include/lbann/transforms/scale_and_translate.hpp new file mode 100644 index 00000000000..42821168b33 --- /dev/null +++ b/include/lbann/transforms/scale_and_translate.hpp @@ -0,0 +1,57 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_SCALE_AND_TRANSLATE_HPP_INCLUDED +#define LBANN_TRANSFORMS_SCALE_AND_TRANSLATE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** Scale and Translate data by a constant pair of constants. */ +class scale_and_translate : public transform { +public: + /** Scale_And_Translate all data by scale_and_translate_val. */ + scale_and_translate(float scale_val, float translate_val) + : transform(), m_scale(scale_val), m_translate(translate_val) {} + + transform* copy() const override { return new scale_and_translate(*this); } + + std::string get_type() const override { return "scale"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Amount to scale data by. */ + float m_scale; + /** Amount to translate data by. */ + float m_translate; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_SCALE_AND_TRANSLATE_HPP_INCLUDED diff --git a/include/lbann/transforms/transform.hpp b/include/lbann/transforms/transform.hpp new file mode 100644 index 00000000000..140028a6429 --- /dev/null +++ b/include/lbann/transforms/transform.hpp @@ -0,0 +1,112 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_TRANSFORM_HPP_INCLUDED +#define LBANN_TRANSFORMS_TRANSFORM_HPP_INCLUDED + +#include "lbann/base.hpp" +#include "lbann/utils/description.hpp" +#include "lbann/utils/random.hpp" +#include "lbann/utils/type_erased_matrix.hpp" +#include "lbann/utils/exception.hpp" + +namespace lbann { +namespace transform { + +/** + * Abstract base class for transforms on data. 
+ * + * A transform takes a CPUMat and modifies it in-place. Transforms should + * be thread-safe, as one instance of a transform may be called concurrently + * within multiple threads. + * + * Because transforms may switch between underlying data types throughout the + * pipeline, everything is done in terms of a type_erased_matrix, which can + * swap between underlying data types. + */ +class transform { +public: + transform() = default; + transform(const transform&) = default; + transform& operator=(const transform&) = default; + virtual ~transform() = default; + + /** Create a copy of the transform instance. */ + virtual transform* copy() const = 0; + + /** Human-readable type name. */ + virtual std::string get_type() const = 0; + /** Human-readable description. */ + virtual description get_description() const { + return description(get_type() + " transform"); + } + + /** True if the transform supports non-in-place apply. */ + virtual bool supports_non_inplace() const { + return false; + } + + /** + * Apply the transform to data. + * @param data The input data to transform, which is modified in-place. The + * matrix shuold be contiguous. + * @param dims The dimensions of the data tensor. For "plain data", dims + * should have one entry, giving its size. For images, dims should have three + * entries: channels, height, width. + * @note dims is a hack until we have proper tensors. + */ + virtual void apply(utils::type_erased_matrix& data, std::vector& dims) = 0; + + /** + * Apply the transform to data. + * This does not modify data in-place but places its output in out. + */ + virtual void apply(utils::type_erased_matrix& data, CPUMat& out, + std::vector& dims) { + LBANN_ERROR("Non-in-place apply not implemented."); + } +protected: + /** Return a value uniformly at random in [a, b). */ + static inline float get_uniform_random(float a, float b) { + fast_rng_gen& gen = get_fast_io_generator(); + std::uniform_real_distribution dist(a, b); + return dist(gen); + } + /** Return true with probability p. */ + static inline bool get_bool_random(float p) { + return get_uniform_random(0.0, 1.0) < p; + } + /** Return an integer uniformly at random in [a, b). */ + static inline El::Int get_uniform_random_int(El::Int a, El::Int b) { + fast_rng_gen& gen = get_fast_io_generator(); + return fast_rand_int(gen, b - a) + a; + } +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_TRANSFORM_HPP_INCLUDED diff --git a/include/lbann/transforms/transform_pipeline.hpp b/include/lbann/transforms/transform_pipeline.hpp new file mode 100644 index 00000000000..50ffb91b799 --- /dev/null +++ b/include/lbann/transforms/transform_pipeline.hpp @@ -0,0 +1,95 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
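The transform interface above (copy, get_type, and an in-place apply on a type_erased_matrix plus a dims vector) is all a new transform has to implement. A hypothetical minimal example, assuming the sample has already been converted to LBANN's DataType and that dims is the std::vector<size_t> used by the other transforms in this patch:

    #include "lbann/transforms/transform.hpp"

    #include <string>
    #include <vector>

    class negate : public lbann::transform::transform {
    public:
      transform* copy() const override { return new negate(*this); }

      std::string get_type() const override { return "negate"; }

      void apply(lbann::utils::type_erased_matrix& data,
                 std::vector<size_t>& /*dims*/) override
      {
        // Flip the sign of every element of the underlying CPU matrix in-place.
        auto& mat = data.get<lbann::DataType>();
        El::Scale(lbann::DataType(-1), mat);
      }
    };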
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_TRANSFORM_PIPELINE_HPP_INCLUDED +#define LBANN_TRANSFORMS_TRANSFORM_PIPELINE_HPP_INCLUDED + +#include "lbann/base.hpp" +#include "lbann/utils/description.hpp" +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** + * Applies a sequence of transforms to input data. + */ +class transform_pipeline { +public: + transform_pipeline() {} + transform_pipeline(const transform_pipeline&); + transform_pipeline(transform_pipeline&&) = default; + transform_pipeline& operator=(const transform_pipeline&); + transform_pipeline& operator=(transform_pipeline&&) = default; + ~transform_pipeline() {} + + transform_pipeline* copy() const { return new transform_pipeline(*this); } + + /** + * Add trans as the next transform to apply. + */ + void add_transform(std::unique_ptr&& trans) { + m_transforms.push_back(std::move(trans)); + } + + /** + * Set the expected dimensions of the data after applying the transforms. + * This is primarily meant as a debugging aid/sanity check. + */ + void set_expected_out_dims(std::vector expected_out_dims) { + m_expected_out_dims = expected_out_dims; + } + + /** + * Apply the transforms to data. + * @param data The data to transform. data will be modified in-place. + * @param dims Dimensions of data. Will be modified in-place. + */ + void apply(utils::type_erased_matrix& data, std::vector& dims); + /** Apply to CPUMat data, which will be modified in-place. */ + void apply(CPUMat& data, std::vector& dims); + /** + * Apply the transforms to data. + * @param data The data to transform. Will be modified in-place. + * @param out_data Output will be placed here. It will not be reallocated. + * @param dims Dimensions of data. Will be modified in-place. + */ + void apply(El::Matrix& data, CPUMat& out_data, + std::vector& dims); +private: + /** Ordered list of transforms to apply. */ + std::vector> m_transforms; + /** Expected dimensions after applying all transforms. */ + std::vector m_expected_out_dims; + + /** Assert dims matches expected_out_dims (if set). 
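A sketch of assembling a pipeline by hand from the transforms in this patch; in practice construct_transform_pipeline, declared in proto/factories.hpp earlier in this diff, builds the equivalent pipeline from a Reader prototext. The std::vector<size_t> dims type is assumed, as above:

    #include "lbann/transforms/sample_normalize.hpp"
    #include "lbann/transforms/scale.hpp"
    #include "lbann/transforms/transform_pipeline.hpp"

    #include <memory>

    lbann::transform::transform_pipeline make_pipeline()
    {
      lbann::transform::transform_pipeline tp;
      tp.add_transform(std::make_unique<lbann::transform::scale>(1.0f / 255.0f));
      tp.add_transform(std::make_unique<lbann::transform::sample_normalize>());
      tp.set_expected_out_dims({3, 224, 224});  // channels, height, width (sanity check only)
      return tp;
    }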
*/ + void assert_expected_out_dims(const std::vector& dims); +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_TRANSFORM_PIPELINE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/CMakeLists.txt b/include/lbann/transforms/vision/CMakeLists.txt new file mode 100644 index 00000000000..2bd30f178c3 --- /dev/null +++ b/include/lbann/transforms/vision/CMakeLists.txt @@ -0,0 +1,24 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + adjust_brightness.hpp + adjust_contrast.hpp + adjust_saturation.hpp + center_crop.hpp + colorize.hpp + color_jitter.hpp + cutout.hpp + grayscale.hpp + horizontal_flip.hpp + normalize_to_lbann_layout.hpp + random_affine.hpp + random_crop.hpp + random_resized_crop.hpp + random_resized_crop_with_fixed_aspect_ratio.hpp + resize.hpp + resized_center_crop.hpp + to_lbann_layout.hpp + vertical_flip.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/transforms/vision/adjust_brightness.hpp b/include/lbann/transforms/vision/adjust_brightness.hpp new file mode 100644 index 00000000000..649c24c8feb --- /dev/null +++ b/include/lbann/transforms/vision/adjust_brightness.hpp @@ -0,0 +1,68 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_ADJUST_BRIGHTNESS_HPP_INCLUDED +#define LBANN_TRANSFORMS_ADJUST_BRIGHTNESS_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Adjust the brightness of an image. */ +class adjust_brightness : public transform { +public: + /** + * Adjust brightness with given factor. + * @param factor A non-negative factor. 0 gives a black image, 1 the original. + */ + adjust_brightness(float factor) : transform(), m_factor(factor) { + if (factor < 0.0f) { + LBANN_ERROR("Brightness factor must be non-negative."); + } + } + + transform* copy() const override { return new adjust_brightness(*this); } + + std::string get_type() const override { return "adjust_brightness"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Factor to adjust brightness by. 
*/ + float m_factor; +}; + +// Builder function +std::unique_ptr +build_adjust_brightness_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_ADJUST_BRIGHTNESS_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/adjust_contrast.hpp b/include/lbann/transforms/vision/adjust_contrast.hpp new file mode 100644 index 00000000000..3c33a747289 --- /dev/null +++ b/include/lbann/transforms/vision/adjust_contrast.hpp @@ -0,0 +1,73 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_ADJUST_CONTRAST_HPP_INCLUDED +#define LBANN_TRANSFORMS_ADJUST_CONTRAST_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Adjust the contrast of an image. + * + * This operates similarly to the contrast control on a television. + */ +class adjust_contrast : public transform { +public: + /** + * Adjust contrast with given factor. + * @param factor A non-negative factor. 0 gives a solid grey image, + * 1 the original. + */ + adjust_contrast(float factor) : transform(), m_factor(factor) { + if (factor < 0.0f) { + LBANN_ERROR("Contrast factor must be non-negative."); + } + } + + transform* copy() const override { return new adjust_contrast(*this); } + + std::string get_type() const override { return "adjust_contrast"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Factor to adjust contrast by. */ + float m_factor; +}; + +// Builder function +std::unique_ptr +build_adjust_contrast_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_ADJUST_CONTRAST_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/adjust_saturation.hpp b/include/lbann/transforms/vision/adjust_saturation.hpp new file mode 100644 index 00000000000..65fb2f9e636 --- /dev/null +++ b/include/lbann/transforms/vision/adjust_saturation.hpp @@ -0,0 +1,75 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_ADJUST_SATURATION_HPP_INCLUDED +#define LBANN_TRANSFORMS_ADJUST_SATURATION_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Adjust the saturation of an image. + * + * This operates similarly to the controls on a color television + * (as opposed to a direct adjustment of saturation) by interpolating + * between the original value and its grayscale value. + */ +class adjust_saturation : public transform { +public: + /** + * Adjust saturation with given factor. + * @param factor A non-negative factor. 0 gives a grayscale image, + * 1 the original. + */ + adjust_saturation(float factor) : transform(), m_factor(factor) { + if (factor < 0.0f) { + LBANN_ERROR("Saturation factor must be non-negative."); + } + } + + transform* copy() const override { return new adjust_saturation(*this); } + + std::string get_type() const override { return "adjust_saturation"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Factor to adjust saturation by. */ + float m_factor; +}; + + +std::unique_ptr +build_adjust_saturation_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_ADJUST_SATURATION_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/center_crop.hpp b/include/lbann/transforms/vision/center_crop.hpp new file mode 100644 index 00000000000..9d4b2026a7e --- /dev/null +++ b/include/lbann/transforms/vision/center_crop.hpp @@ -0,0 +1,59 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
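adjust_brightness, adjust_contrast, and adjust_saturation all describe their factor the same way: 0 yields the degenerate image (black, solid grey, or grayscale, respectively), 1 yields the original, and values above 1 exaggerate the adjustment. One common way to realize that description, which the wording above suggests but the headers do not mandate, is a per-pixel blend against a reference value (0 for brightness, the image's mean grey for contrast, the pixel's own grey value for saturation):

    // Illustrative only; the actual kernels live in the corresponding .cpp files.
    inline float blend(float original, float reference, float factor) {
      // factor == 0 -> reference, factor == 1 -> original, factor > 1 -> overshoot.
      return factor * original + (1.0f - factor) * reference;
    }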
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_CENTER_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_CENTER_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Crop an image at the center. */ +class center_crop : public transform { +public: + /** Crop to an h x w image. */ + center_crop(size_t h, size_t w) : transform(), m_h(h), m_w(w) {} + + transform* copy() const override { return new center_crop(*this); } + + std::string get_type() const override { return "center_crop"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the crop. */ + size_t m_h, m_w; +}; + +std::unique_ptr +build_center_crop_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_CENTER_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/color_jitter.hpp b/include/lbann/transforms/vision/color_jitter.hpp new file mode 100644 index 00000000000..cd0ac8805a0 --- /dev/null +++ b/include/lbann/transforms/vision/color_jitter.hpp @@ -0,0 +1,85 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_COLOR_JITTER_HPP_INCLUDED +#define LBANN_TRANSFORMS_COLOR_JITTER_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Randomly change brightness, contrast, and saturation. + * This randomly adjusts brightness, contrast, and saturation, in a random + * order. + */ +class color_jitter : public transform { +public: + /** + * Randomly adjust brightness, contrast, and saturation within given ranges. + * Set both min and max to 0 to disable that adjustment. + * @param min_brightness_factor Minimum brightness adjustment (>= 0). + * @param max_brightness_factor Maximum brightness adjustment. + * @param min_contrast_factor Minimum contrast adjustment (>= 0). + * @param max_contrast_factor Maximum contrast adjustment. + * @param min_saturation_factor Minimum saturation adjustment (>= 0). + * @param max_saturation_factor Maximum saturation adjustment. 
+ */ + color_jitter(float min_brightness_factor, float max_brightness_factor, + float min_contrast_factor, float max_contrast_factor, + float min_saturation_factor, float max_saturation_factor); + + transform* copy() const override { return new color_jitter(*this); } + + std::string get_type() const override { return "color_jitter"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Minimum brightness factor. */ + float m_min_brightness_factor; + /** Maximum brightness factor. */ + float m_max_brightness_factor; + /** Minimum contrast factor. */ + float m_min_contrast_factor; + /** Maximum contrast factor. */ + float m_max_contrast_factor; + /** Minimum saturation factor. */ + float m_min_saturation_factor; + /** Maximum saturation factor. */ + float m_max_saturation_factor; +}; + +std::unique_ptr +build_color_jitter_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_COLOR_JITTER_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/colorize.hpp b/include/lbann/transforms/vision/colorize.hpp new file mode 100644 index 00000000000..48864b0869f --- /dev/null +++ b/include/lbann/transforms/vision/colorize.hpp @@ -0,0 +1,53 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_COLORIZE_HPP_INCLUDED +#define LBANN_TRANSFORMS_COLORIZE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Convert an image from grayscale to color. */ +class colorize : public transform { +public: + transform* copy() const override { return new colorize(*this); } + + std::string get_type() const override { return "colorize"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +}; + +std::unique_ptr +build_colorize_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_COLORIZE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/cutout.hpp b/include/lbann/transforms/vision/cutout.hpp new file mode 100644 index 00000000000..b41c71f2800 --- /dev/null +++ b/include/lbann/transforms/vision/cutout.hpp @@ -0,0 +1,87 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. 
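For the color_jitter transform above, each of the three adjustments is drawn from its own [min, max] range, and setting both bounds of a range to 0 disables that adjustment entirely. A typical construction, with purely illustrative values:

    #include "lbann/transforms/vision/color_jitter.hpp"

    // Jitter brightness and contrast by up to +/-25%; leave saturation alone.
    lbann::transform::color_jitter jitter(0.75f, 1.25f,  // brightness range
                                          0.75f, 1.25f,  // contrast range
                                          0.0f,  0.0f);  // saturation disabled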
+// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_CUTOUT_HPP_INCLUDED +#define LBANN_TRANSFORMS_CUTOUT_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Cutout data augmentation which randomly masks out square regions of input. + * + * See: + * + * DeVries and Taylor. "Improved Regularization of Convolutional Neural + * Networks with Cutout". arXiv preprint arXiv:1708.04552 (2017). + * + * This will randomly select a center pixel for each square and set all pixels + * within that square to 0. It is permissible for portions of the masks to lie + * outside of the image. + * + * Normalization about 0 should be applied after applying cutout. + */ +class cutout : public transform { +public: + /** + * Cutout with a given number of squares of a given size. + * @param num_holes Number of squares to mask out (must be positive). + * @param length Length of a side of the square (must be positive). + */ + cutout(size_t num_holes, size_t length) : + transform(), m_num_holes(num_holes), m_length(length) { + if (num_holes == 0) { + LBANN_ERROR("num_holes must be positive, got 0"); + } + if (length == 0) { + LBANN_ERROR("length must be positive, got 0"); + } + } + + transform* copy() const override { return new cutout(*this); } + + std::string get_type() const override { return "cutout"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Number of squares that will be masked out. */ + size_t m_num_holes; + /** Length of a side of each square that will be masked out. */ + size_t m_length; +}; + +std::unique_ptr +build_cutout_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_CUTOUT_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/grayscale.hpp b/include/lbann/transforms/vision/grayscale.hpp new file mode 100644 index 00000000000..a03b2b940cd --- /dev/null +++ b/include/lbann/transforms/vision/grayscale.hpp @@ -0,0 +1,53 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
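The cutout transform above picks a random center pixel for each of its num_holes squares and zeroes every pixel of that square that falls inside the image. A hypothetical sketch of the masking step for a single hole on a planar channels x height x width float buffer; the real implementation operates on LBANN's image representation, so treat the layout here as an assumption:

    #include <algorithm>
    #include <cstddef>

    // Zero a length x length square centered at (cy, cx), clamped to the image.
    void mask_square(float* img, std::size_t channels, std::size_t height,
                     std::size_t width, std::size_t cy, std::size_t cx,
                     std::size_t length) {
      const std::size_t half = length / 2;
      const std::size_t y0 = (cy > half) ? cy - half : 0;
      const std::size_t x0 = (cx > half) ? cx - half : 0;
      const std::size_t y1 = std::min(height, cy + (length - half));
      const std::size_t x1 = std::min(width,  cx + (length - half));
      for (std::size_t c = 0; c < channels; ++c) {
        for (std::size_t y = y0; y < y1; ++y) {
          for (std::size_t x = x0; x < x1; ++x) {
            img[(c * height + y) * width + x] = 0.0f;  // mask out the hole
          }
        }
      }
    }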
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_GRAYSCALE_HPP_INCLUDED +#define LBANN_TRANSFORMS_GRAYSCALE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Convert an image to grayscale. */ +class grayscale : public transform { +public: + transform* copy() const override { return new grayscale(*this); } + + std::string get_type() const override { return "grayscale"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +}; + +std::unique_ptr +build_grayscale_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_GRAYSCALE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/horizontal_flip.hpp b/include/lbann/transforms/vision/horizontal_flip.hpp new file mode 100644 index 00000000000..0d7a640f698 --- /dev/null +++ b/include/lbann/transforms/vision/horizontal_flip.hpp @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_HORIZONTAL_FLIP_HPP_INCLUDED +#define LBANN_TRANSFORMS_HORIZONTAL_FLIP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Horizontally flip image data with given probability. */ +class horizontal_flip : public transform { +public: + /** Flip image with probability p. 
*/ + horizontal_flip(float p) : transform(), m_p(p) {} + + transform* copy() const override { return new horizontal_flip(*this); } + + std::string get_type() const override { return "horizontal_flip"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Probability that that the image is flipped. */ + float m_p; +}; + +std::unique_ptr +build_horizontal_flip_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_HORIZONTAL_FLIP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp b/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp new file mode 100644 index 00000000000..ef91c7fedaa --- /dev/null +++ b/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_NORMALIZE_TO_LBANN_LAYOUT_HPP_INCLUDED +#define LBANN_TRANSFORMS_NORMALIZE_TO_LBANN_LAYOUT_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Normalize and convert data to LBANN's native data layout. + * Currently only supports converting from OpenCV layouts. + * This normalizes with provided channel-wise means and standard deviations, + * scales from [0, 255] to [0, 1], and converts to LBANN's data layout. + * Normalization is applied after the scaling to [0, 1]. + * This essentially fuses the to_lbann_layout and normalize transforms. + */ +class normalize_to_lbann_layout : public transform { +public: + /** Apply channel-wise means and standard deviations. */ + normalize_to_lbann_layout(std::vector means, std::vector stds) : + transform(), m_means(means), m_stds(stds) { + if (m_means.size() != m_stds.size()) { + LBANN_ERROR("Normalize mean and std have different numbers of channels."); + } + } + + transform* copy() const override { return new normalize_to_lbann_layout(*this); } + + std::string get_type() const override { return "normalize_to_lbann_layout"; } + + bool supports_non_inplace() const override { return true; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + + void apply(utils::type_erased_matrix& data, CPUMat& out, + std::vector& dims) override; +private: + /** Channel-wise means. 
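normalize_to_lbann_layout above fuses three steps: rescale raw [0, 255] values to [0, 1], apply the per-channel statistics, and convert to LBANN's layout. Assuming the conventional (x - mean) / std form of normalization, the per-pixel arithmetic implied by the class description is:

    // Illustrative per-pixel computation; mean_c and std_c belong to the pixel's channel.
    inline float normalize_pixel(unsigned char raw, float mean_c, float std_c) {
      const float scaled = static_cast<float>(raw) / 255.0f;  // [0, 255] -> [0, 1]
      return (scaled - mean_c) / std_c;                       // channel-wise stats
    }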
*/ + std::vector m_means; + /** Channel-wise standard deviations. */ + std::vector m_stds; +}; + +std::unique_ptr +build_normalize_to_lbann_layout_transform_from_pbuf( + google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_NORMALIZE_TO_LBANN_LAYOUT_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_affine.hpp b/include/lbann/transforms/vision/random_affine.hpp new file mode 100644 index 00000000000..4ef0c587a24 --- /dev/null +++ b/include/lbann/transforms/vision/random_affine.hpp @@ -0,0 +1,82 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_AFFINE_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_AFFINE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Apply a random affine transform to an image. */ +class random_affine : public transform { +public: + /** + * Set up the affine transform. + * Rotate a random number of degrees selected in [rotate_min, rotate_max]. + * Translate the vertical dimension in a random amount in [-h*translate_h, + * h*translate_h], and the horizontal dimension in [-w*translate_w, + * w*translate_w]. + * Scale by a random amount in [scale_min, scale_max]. + * Shear by a random number of degrees in [shear_min, shear_max]. + * Set arguments to 0 to disable that transform. + */ + random_affine(float rotate_min, float rotate_max, + float translate_h, float translate_w, + float scale_min, float scale_max, + float shear_min, float shear_max) : + transform(), + m_rotate_min(rotate_min), m_rotate_max(rotate_max), + m_translate_h(translate_h), m_translate_w(translate_w), + m_scale_min(scale_min), m_scale_max(scale_max), + m_shear_min(shear_min), m_shear_max(shear_max) {} + + transform* copy() const override { return new random_affine(*this); } + + std::string get_type() const override { return "random_affine"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Range in degrees to rotate. */ + float m_rotate_min, m_rotate_max; + /** Fraction of height/width to translate. */ + float m_translate_h, m_translate_w; + /** Range for fraction to scale by. */ + float m_scale_min, m_scale_max; + /** Range for degrees to shear. 
*/ + float m_shear_min, m_shear_max; +}; + +std::unique_ptr +build_random_affine_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_AFFINED_CENTER_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_crop.hpp b/include/lbann/transforms/vision/random_crop.hpp new file mode 100644 index 00000000000..dce14b98111 --- /dev/null +++ b/include/lbann/transforms/vision/random_crop.hpp @@ -0,0 +1,59 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Crop an image at a random location. */ +class random_crop : public transform { +public: + /** Crop to an h x w image. */ + random_crop(size_t h, size_t w) : transform(), m_h(h), m_w(w) {} + + transform* copy() const override { return new random_crop(*this); } + + std::string get_type() const override { return "random_crop"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the crop. */ + size_t m_h, m_w; +}; + +std::unique_ptr +build_random_crop_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_resized_crop.hpp b/include/lbann/transforms/vision/random_resized_crop.hpp new file mode 100644 index 00000000000..8f957106303 --- /dev/null +++ b/include/lbann/transforms/vision/random_resized_crop.hpp @@ -0,0 +1,80 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Extract a crop of random size and aspect ratio, then crop to a size. + * This is commonly used for Inception-style networks and some other + * image classification networks. + */ +class random_resized_crop : public transform { +public: + /** + * Crop to a random size and aspect ratio, then resize to h x w. + * The random crop has area in [scale_min, scale_max] of the original image + * area, and aspect ratio in [ar_min, ar_max] of the original. This random + * crop is then resized to be h x w. + * These default to (0.08, 1.0) and (3/4, 4/3), respectively, which are the + * standard. + */ + random_resized_crop(size_t h, size_t w, + float scale_min=0.08, float scale_max=1.0, + float ar_min=0.75, float ar_max=4.0f/3.0f) : + transform(), + m_h(h), m_w(w), + m_scale_min(scale_min), m_scale_max(scale_max), + m_ar_min(ar_min), m_ar_max(ar_max) {} + + transform* copy() const override { return new random_resized_crop(*this); } + + std::string get_type() const override { return "random_resized_crop"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the final crop. */ + size_t m_h, m_w; + /** Range for the area of the random crop. */ + float m_scale_min, m_scale_max; + /** Range for the aspect ratio of the random crop. */ + float m_ar_min, m_ar_max; +}; + +std::unique_ptr +build_random_resized_crop_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp b/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp new file mode 100644 index 00000000000..8290254aa82 --- /dev/null +++ b/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp @@ -0,0 +1,68 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
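random_resized_crop above samples a crop whose area is a random fraction (scale_min to scale_max) of the source image and whose aspect ratio lies in [ar_min, ar_max], then resizes that crop to h x w. A hypothetical sketch of how one candidate crop size could be derived from those samples; the shipped transform uses LBANN's fast I/O RNG rather than <random>, and how it handles candidates that do not fit inside the image is not shown in this header:

    #include <cmath>
    #include <cstddef>
    #include <random>

    void sample_crop(std::size_t img_h, std::size_t img_w,
                     float scale_min, float scale_max,
                     float ar_min, float ar_max,
                     std::mt19937& gen,
                     std::size_t& crop_h, std::size_t& crop_w) {
      std::uniform_real_distribution<float> scale_dist(scale_min, scale_max);
      std::uniform_real_distribution<float> ar_dist(ar_min, ar_max);
      const float area = scale_dist(gen) * static_cast<float>(img_h * img_w);
      const float ar = ar_dist(gen);  // aspect ratio = width / height
      crop_w = static_cast<std::size_t>(std::round(std::sqrt(area * ar)));
      crop_h = static_cast<std::size_t>(std::round(std::sqrt(area / ar)));
    }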
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_WITH_FIXED_ASPECT_RATIO_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_WITH_FIXED_ASPECT_RATIO_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Resize an image then extract a random crop. */ +class random_resized_crop_with_fixed_aspect_ratio : public transform { +public: + /** Resize to h x w, then extract a random crop_h x crop_w crop. */ + random_resized_crop_with_fixed_aspect_ratio( + size_t h, size_t w, size_t crop_h, size_t crop_w) : + transform(), m_h(h), m_w(w), m_crop_h(crop_h), m_crop_w(crop_w) {} + + transform* copy() const override { + return new random_resized_crop_with_fixed_aspect_ratio(*this); + } + + std::string get_type() const override { + return "random_resized_crop_with_fixed_aspect_ratio"; + } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the resized image. */ + size_t m_h, m_w; + /** Height and width of the crop. */ + size_t m_crop_h, m_crop_w; +}; + +std::unique_ptr +build_random_resized_crop_with_fixed_aspect_ratio_transform_from_pbuf( + google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_WITH_FIXED_ASPECT_RATIO_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/resize.hpp b/include/lbann/transforms/vision/resize.hpp new file mode 100644 index 00000000000..668b925c9b9 --- /dev/null +++ b/include/lbann/transforms/vision/resize.hpp @@ -0,0 +1,59 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RESIZE_HPP_INCLUDED +#define LBANN_TRANSFORMS_RESIZE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Resize an image. 
*/ +class resize : public transform { +public: + /** Resize to h x w. */ + resize(size_t h, size_t w) : transform(), m_h(h), m_w(w) {} + + transform* copy() const override { return new resize(*this); } + + std::string get_type() const override { return "resize"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the resized image. */ + size_t m_h, m_w; +}; + +std::unique_ptr +build_resize_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RESIZE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/resized_center_crop.hpp b/include/lbann/transforms/vision/resized_center_crop.hpp new file mode 100644 index 00000000000..0ccb0ef93e6 --- /dev/null +++ b/include/lbann/transforms/vision/resized_center_crop.hpp @@ -0,0 +1,62 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RESIZED_CENTER_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_RESIZED_CENTER_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Resize an image and then crop its center. */ +class resized_center_crop : public transform { +public: + /** Resize to h x w, then extract a crop_h x crop_w crop from the center. */ + resized_center_crop(size_t h, size_t w, size_t crop_h, size_t crop_w) : + transform(), m_h(h), m_w(w), m_crop_h(crop_h), m_crop_w(crop_w) {} + + transform* copy() const override { return new resized_center_crop(*this); } + + std::string get_type() const override { return "resized_center_crop"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the resized image. */ + size_t m_h, m_w; + /** Height and width of the crop. 
*/ + size_t m_crop_h, m_crop_w; +}; + +std::unique_ptr +build_resized_center_crop_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RESIZED_CENTER_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/to_lbann_layout.hpp b/include/lbann/transforms/vision/to_lbann_layout.hpp new file mode 100644 index 00000000000..5cbb81f699a --- /dev/null +++ b/include/lbann/transforms/vision/to_lbann_layout.hpp @@ -0,0 +1,62 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_TO_LBANN_LAYOUT_HPP_INCLUDED +#define LBANN_TRANSFORMS_TO_LBANN_LAYOUT_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** + * Convert data to LBANN's native data layout. + * Currently only supports converting from OpenCV layouts. + * This will also rescale data from [0, 255] to [0, 1]. + */ +class to_lbann_layout : public transform { +public: + transform* copy() const override { return new to_lbann_layout(*this); } + + std::string get_type() const override { return "to_lbann_layout"; } + + bool supports_non_inplace() const override { return true; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + + void apply(utils::type_erased_matrix& data, CPUMat& out, + std::vector& dims) override; +}; + +std::unique_ptr +build_to_lbann_layout_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_TO_LBANN_LAYOUT_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/vertical_flip.hpp b/include/lbann/transforms/vision/vertical_flip.hpp new file mode 100644 index 00000000000..712547c733a --- /dev/null +++ b/include/lbann/transforms/vision/vertical_flip.hpp @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
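resize, resized_center_crop, and random_resized_crop_with_fixed_aspect_ratio cover the usual resize-then-crop combinations so a pipeline does not have to chain separate resize and crop transforms. A sketch of an evaluation-style pipeline built from them, assuming float statistics for normalize_to_lbann_layout (its element type is elided above) and using the well-known ImageNet means and standard deviations purely as example values:

    #include "lbann/transforms/transform_pipeline.hpp"
    #include "lbann/transforms/vision/resized_center_crop.hpp"
    #include "lbann/transforms/vision/normalize_to_lbann_layout.hpp"

    #include <memory>
    #include <vector>

    lbann::transform::transform_pipeline make_eval_pipeline() {
      lbann::transform::transform_pipeline p;
      // Resize to 256x256, then crop the central 224x224 region.
      p.add_transform(
        std::make_unique<lbann::transform::resized_center_crop>(256, 256, 224, 224));
      // Scale to [0, 1], normalize per channel, and convert to LBANN's layout.
      p.add_transform(
        std::make_unique<lbann::transform::normalize_to_lbann_layout>(
          std::vector<float>{0.485f, 0.456f, 0.406f},
          std::vector<float>{0.229f, 0.224f, 0.225f}));
      return p;
    }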
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_VERTICAL_FLIP_HPP_INCLUDED +#define LBANN_TRANSFORMS_VERTICAL_FLIP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +#include + +namespace lbann { +namespace transform { + +/** Vertically flip image data with given probability. */ +class vertical_flip : public transform { +public: + /** Flip image with probability p. */ + vertical_flip(float p) : transform(), m_p(p) {} + + transform* copy() const override { return new vertical_flip(*this); } + + std::string get_type() const override { return "vertical_flip"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Probability that that the image is flipped. */ + float m_p; +}; + +std::unique_ptr +build_vertical_flip_transform_from_pbuf(google::protobuf::Message const&); + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_VERTICAL_FLIP_HPP_INCLUDED diff --git a/include/lbann/utils/CMakeLists.txt b/include/lbann/utils/CMakeLists.txt index a07932b662f..d5b680771a3 100644 --- a/include/lbann/utils/CMakeLists.txt +++ b/include/lbann/utils/CMakeLists.txt @@ -1,6 +1,7 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS any.hpp + argument_parser.hpp compiler_control.hpp cublas.hpp cuda.hpp @@ -8,27 +9,50 @@ set_full_path(THIS_DIR_HEADERS dataset.hpp description.hpp entrywise_operator.hpp + enum_iterator.hpp + environment_variable.hpp + eti_macros.hpp exception.hpp factory.hpp factory_error_policies.hpp file_utils.hpp glob.hpp + hydrogen_utils.hpp im2col.hpp + image.hpp jag_utils.hpp lbann_library.hpp mild_exception.hpp number_theory.hpp + nvshmem.hpp omp_diagnostics.hpp + opencv.hpp options.hpp + nvshmem.hpp profiling.hpp prototext.hpp + python.hpp random.hpp + random_number_generators.hpp + serialization.hpp statistics.hpp summary.hpp + summary_impl.hpp timer.hpp + trainer_file_utils.hpp type_erased_matrix.hpp + typename.hpp ) +if (LBANN_HAS_HALF) + list(APPEND THIS_DIR_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/serialization.hpp) +endif (LBANN_HAS_HALF) + +if (LBANN_HAS_DISTCONV) + list(APPEND THIS_DIR_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/distconv.hpp") +endif () + # Add the subdirectories add_subdirectory(threads) add_subdirectory(impl) diff --git a/include/lbann/utils/any.hpp b/include/lbann/utils/any.hpp index 6b55e7caf8b..ae95c6dee47 100644 --- a/include/lbann/utils/any.hpp +++ b/include/lbann/utils/any.hpp @@ -21,7 +21,8 @@ namespace lbann namespace utils { -#ifdef LBANN_HAS_STD_ANY +// Note (tym 4/8/20): CMake doesn't support NVCC with C++17 +#if defined(LBANN_HAS_STD_ANY) && !defined(__CUDACC__) // This case is simple symbol injection; don't feel great about this, // but it's not my fault they couldn't get this into C++11... 
@@ -64,7 +65,7 @@ class any ///@{ /** @brief Default construct an empty "any" */ - any() noexcept = default; + any() noexcept {} /** @brief Construct an object holding a T */ template diff --git a/include/lbann/utils/argument_parser.hpp b/include/lbann/utils/argument_parser.hpp new file mode 100644 index 00000000000..f213a13f690 --- /dev/null +++ b/include/lbann/utils/argument_parser.hpp @@ -0,0 +1,790 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_UTILS_ARGUMENT_PARSER_HPP_INCLUDED +#define LBANN_UTILS_ARGUMENT_PARSER_HPP_INCLUDED + +#include "lbann/utils/any.hpp" +#include "lbann/utils/environment_variable.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace lbann +{ +namespace utils +{ + +/** @class parse_error + * @brief std::exception subclass that is thrown if the parser + * can not parse the arguments. + */ +struct parse_error : std::runtime_error +{ + /** @brief Construct the exception with the string to be + * return by what() + */ + template + parse_error(T&& what_arg) + : std::runtime_error{std::forward(what_arg)} {} +};// parse_error + +/** @class strict_parsing + * + * Allows any valid subset of parameters. This will throw an + * exception for any error raised by the underlying parser. + */ +struct strict_parsing +{ + void handle_error(clara::detail::InternalParseResult result, + clara::Parser& parser, + std::vector& argv); +};// struct strict_parsing + +/** @class allow_extra_parameters + * + * Ignores "unknown token" errors raised by the parser and attempts + * to proceed until all tokens are processed or another error is + * detected. + */ +struct allow_extra_parameters +{ + void handle_error(clara::detail::InternalParseResult result, + clara::Parser& parser, + std::vector& argv); +};// struct allow_extra_parameters + +/** @class argument_parser + * @brief Basic argument parsing with automatic help messages. + * + * @section arg_parser_params Supported parameter types + * + * The argument parser supports 3 types of command line parameters: + * flags, options, and arguments. + * + * @subsection arg_parser_flags Flags + * + * Flags default to "false" and toggle to "true" when they are given + * on the command line. It is an error to provide a value to a flag + * on the command line (e.g., "-flag 0"). 
If a flag called "-v" is + * tied to a variable called `verbose`, `verbose` will have default + * value `false`. Passing "-v" on the command line, `a.out -v`, will + * result in `verbose` having post-parse value `true`. + * + * @subsection arg_parser_options Options + * + * Options represent key-value pairs. They must take only a single + * value (e.g. `a.out -key value`). It is an error to omit a value + * for a parameter of option type (e.g., `a.out -key`). Options are + * strongly typed to match their default values. The string passed on + * the command line must be convertible to the type of the default + * value provided by the developer programmatically. + * + * @subsection arg_parser_arguments Arguments + * + * Arguments (or "positional arguments") do not name a key on the + * command line and are implicitly keyed by their index in the + * argument list. A corollary to this is that required arguments must + * appear before optional arguments. Arguments with each category + * ("required" and "optional") are keyed in the order in which they + * are added. + * + * On command line, "optional" arguments are ordered after the + * "required" arguments, in the order in which they are added. For + * example, adding an (optional) argument called "A", then adding + * a required argument called "B", then adding an (optioinal) + * argument called "C" will require that these arguments be passed + * as `a.out B A C`. Since "A" and "C" are optional, it is also + * valid to pass `a.out B` or `a.out B A`. It is undefined + * behavior to pass `a.out B C`. + * + * Erroneously passing `a.out B C` might be accepted by the parser + * if "A" and "C" have the same (or sufficiently compatible) + * types, but the output will not be as unexpected (the variable + * bound to "A" will have the value expected in "C", and the + * variable bound to "C" will have its default value). If "A" and + * "C" are not compatible types, an exception will be thrown. In + * the first case, the parser cannot read your mind to know if you + * passed things in the right order; it is the application + * developer's responsibility to ensure that all arguments have + * been added before the help message is printed, and it is the + * user's responsibility to consult the help message for the + * runtime ordering of arguments. + * + * @section arg_parser_finalize Finalization + * + * To accomodate the presence of required arguments with the + * maintenance-intensive practice of adding arguments willy-nilly + * (because I don't believe a PR without said terrifying + * capability would ever make it through), parsing of the + * arguments can be done two ways: with or without finalization. + * + * If there are no required arguments registered in the parser, + * these should be equivalent. If there are required arguments, + * they must all have been registered with the parser and seen in + * the arguments given to the parse functions before + * finalization. Semantically, the parser must be finalized before + * attempting to use any of the required arguments. + */ +template +class argument_parser : ErrorHandler +{ +public: + + /** @name Public types */ + ///@{ + + /** @brief A proxy class representing the current value associated + * with an option. + * + * This class is best manipulated generically, through `auto` + * variables. + * + * @tparam T The type of the held object. 
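To make the flag semantics described above concrete, here is a sketch of tying "-v" to a boolean. It assumes the strict_parsing policy declared earlier and a parse(argc, argv)-style entry point; the parse functions are referenced in the discussion above but their declarations fall outside the hunk shown here, so treat that call as an assumption:

    #include "lbann/utils/argument_parser.hpp"

    int main(int argc, char* argv[]) {
      lbann::utils::argument_parser<lbann::utils::strict_parsing> parser;
      // Defaults to false; "a.out -v" or "a.out --verbose" flips it to true.
      auto verbose = parser.add_flag("verbose", {"-v", "--verbose"},
                                     "Print extra progress information.");
      parser.parse(argc, argv);  // assumed entry point; see the parse/finalize notes above
      if (verbose) {
        // readonly_reference converts to bool const&, so it reads naturally here.
      }
      return 0;
    }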
+ */ + template + class readonly_reference + { + public: + readonly_reference(T& val) noexcept : ref_(val) {} + T const& get() const noexcept { return ref_; } + operator T const& () const noexcept { return this->get(); } + + template + bool operator==(S const& y) const noexcept + { return this->get() == y; } + + private: + T& ref_; + };// class readonly_reference + + /** @class parse_error + * @brief std::exception subclass that is thrown if the parser + * can not parse the arguments. + */ + struct parse_error : std::runtime_error + { + /** @brief Construct the exception with the string to be + * return by what() + */ + template + parse_error(T&& what_arg) + : std::runtime_error{std::forward(what_arg)} {} + }; + + /** @class missing_required_arguments + * @brief std::exception subclass that is thrown if a required + * argument is not found. + */ + struct missing_required_arguments : std::runtime_error + { + /** @brief Construct the exception with a list of the missing + * argument names. + * + * @param[in] missing_args A container that holds the names + * of the missing arguments. + */ + template + missing_required_arguments(Container const& missing_args) + : std::runtime_error{build_what_string_(missing_args)} + {} + + private: + template + std::string build_what_string_(Container const& missing_args) + { + std::ostringstream oss; + oss << "The following required arguments are missing: {"; + for (auto const& x : missing_args) + oss << " \"" << x << "\""; + oss << " }"; + return oss.str(); + } + }; + + ///@} + +public: + + /** @name Constructors */ + ///@{ + + /** @brief Create the parser */ + argument_parser(); + + ///@} + /** @name Adding options and arguments */ + ///@{ + + /** @brief Add a flag (i.e. a boolean parameter that is "true" if + * given and "false" if not given). + * + * The value of a flag defaults to `false`. If, for some strange + * reason, users should be forced to type the boolean value on + * the command line, e.g., "my_exe -b 1", use add_option() + * instead. If a flag with default value `true` is desired, + * invert the logic and use this instead. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to trigger + * this flag to `true`. At least one must be given. + * @param[in] description A brief description of the argument, + * used for the help message. + * + * @return A read-only reference to the value pointed to by this + * flag. + */ + readonly_reference + add_flag(std::string const& name, + std::initializer_list cli_flags, + std::string const& description); + + /** @brief Add a flag with environment variable override. + * + * The value of a flag defaults to `false`. The flag may be set to + * `true` by passing the flag on the command line. Alternatively, + * it may be set to `true` if the environment variable `env` is + * defined and has a value that converts to `true`. + * + * @tparam AccessPolicy The access method for the environment + * variable. (Deduced.) + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to trigger + * this flag to `true`. At least one must be given. + * @param[in] env The environment variable to prefer over the + * default parameter value. + * @param[in] description A brief description of the argument, + * used for the help message. + * + * @return A read-only reference to the value pointed to by this + * flag. 
+ */ + template + readonly_reference + add_flag(std::string const& name, + std::initializer_list cli_flags, + EnvVariable env, + std::string const& description) + { + if (env.exists() && env.template value()) + return add_flag_impl_(name, std::move(cli_flags), description, true); + else + return add_flag(name, std::move(cli_flags), description); + } + + /** @brief Add an additional named option. + * + * Currently, named options are all optional. This could be + * expanded if needed. + * + * @tparam T The type associated with the option. Deduced if a + * default value is given. If the default value is not + * given, the template parameter must be named explicitly + * and the default value will be default-constructed. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to identify + * this option and its value. At least one must be + * given. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The default value to be returned if + * the option is not passed to the command line. + * + * @return A read-only reference to the value pointed to by this + * option. + */ + template + readonly_reference + add_option(std::string const& name, + std::initializer_list cli_flags, + std::string const& description, + T default_value = T()); + + /** @brief Add an additional named option. + * + * Currently, named options are all optional. This could be + * expanded if needed. + * + * @tparam T The type associated with the option. Deduced if a + * default value is given. If the default value is not + * given, the template parameter must be named explicitly + * and the default value will be default-constructed. + * @tparam AccessPolicy The access method for the environment + * variable. (Deduced.) + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to identify + * this option and its value. At least one must be + * given. + * @param[in] env The environment variable to prefer over the + * default parameter value. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The default value to be returned if + * the option is not passed to the command line. + * + * @return A read-only reference to the value pointed to by this + * option. + */ + template + readonly_reference + add_option(std::string const& name, + std::initializer_list cli_flags, + EnvVariable env, + std::string const& description, + T default_value = T()) + { + if (env.exists()) + return add_option(name, std::move(cli_flags), description, + env.template value()); + else + return add_option(name, std::move(cli_flags), description, + std::move(default_value)); + } + + /** @brief Add an additional named option; overloaded for "char + * const*" parameters. + * + * The value will be stored as an `std::string`. Its value must + * be extracted using `get(name)`. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to trigger + * this flag to `true`. At least one must be given. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The default value to be returned if + * the option is not passed to the command line. + * + * @return A read-only reference to the value pointed to by this + * option. 
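The environment-variable overloads above prefer a defined environment variable over the programmatic default, while an explicit command-line value still takes precedence because the option itself is unchanged. The sketch below is hedged: it assumes an accessor type spelled utils::ENV from lbann/utils/environment_variable.hpp (that header is not shown in this excerpt), and the option and variable names are illustrative.

    // Assumption: utils::ENV provides the exists()/value<T>() interface the
    // EnvVariable template parameter requires; names here are hypothetical.
    auto io_threads = parser.add_option(
      "io threads", {"--io-threads"},
      lbann::utils::ENV("LBANN_IO_THREADS"),
      "Number of background I/O threads", 4);
    // Default resolution order: --io-threads on the command line, then
    // LBANN_IO_THREADS from the environment, then the literal 4.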
+ */ + readonly_reference + add_option(std::string const& name, + std::initializer_list cli_flags, + std::string const& description, + char const* default_value) + { + return add_option(name, std::move(cli_flags), description, + std::string(default_value)); + } + + /** @brief Add an additional named option; overloaded for "char + * const*" parameters. + * + * The value will be stored as an `std::string`. Its value must + * be extracted using `get(name)`. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] cli_flags The valid command line flags to trigger + * this flag to `true`. At least one must be given. + * @param[in] env The environment variable to prefer over the + * default parameter value. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The default value to be returned if + * the option is not passed to the command line. + * + * @return A read-only reference to the value pointed to by this + * option. + */ + template + readonly_reference + add_option(std::string const& name, + std::initializer_list cli_flags, + EnvVariable env, + std::string const& description, + char const* default_value) + { + return add_option(name, cli_flags, std::move(env), + description, std::string(default_value)); + } + + /** @brief Add an optional positional argument. + * + * These are essentially defaulted positional arguments. They must + * be given on the command line in the order in which they are + * added to the parser. If the arguments have all been added by the + * time the help message is produced, the help message will display + * the correct ordering. + * + * @tparam T The type to which the argument maps. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The value to use for this argument if + * not detected in the formal argument list. + * + * @return A read-only reference to the value pointed to by this + * argument. + */ + template + readonly_reference add_argument( + std::string const& name, + std::string const& description, + T default_value = T()); + + /** @brief Add a positional argument; char const* overload + * + * The data is stored in an std::string object internally and + * must be accessed using `get(name)`. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] description A brief description of the argument, + * used for the help message. + * @param[in] default_value The value to use for this argument if + * not detected in the formal argument list. + * + * @return A read-only reference to the value pointed to by this + * argument. + */ + readonly_reference add_argument( + std::string const& name, + std::string const& description, + char const* default_value) + { + return add_argument( + name, description, std::string(default_value)); + } + + /** @brief Add a "required" positional argument. + * + * @tparam T The type to which the argument maps. + * + * @param[in] name The name to be used to refer to the argument. + * @param[in] description A brief description of the argument, + * used for the help message. + * + * @return A read-only reference to the value pointed to by this + * argument. 
+ */ + template + readonly_reference add_required_argument( + std::string const& name, + std::string const& description); + + ///@} + /** @name Command-line-like parsing */ + ///@{ + + /** @brief Parse the command line arguments and finalize the + * arguments. + * + * This is equivalent to calling parse_no_finalize() followed + * immediately by finalize(). + * + * @param[in] argc The number of arguments + * @param[in] argv The list of arguments + * + * @throws parse_error if an internal parsing error is detected. + */ + void parse(int argc, char const* const argv[]); + + /** @brief Parse the command line arguments but do not finalize + * the parser. + * + * This parses command-line-like arguments but does no checks for + * required arguments. Users should call finalize() before + * attempting to use the values associated with any required + * arguments. + * + * @param[in] argc The number of arguments + * @param[in] argv The list of arguments + * + * @throws parse_error if an internal parsing error is detected. + */ + void parse_no_finalize(int argc, char const* const argv[]); + + /** @brief Assert that all required components are set properly. + * + + * This should be called sometime after parse_no_finalize() and + * before using the values. This is implicitly called by parse(). + * + * @throws missing_required_arguments If a missing argument is + * detected. + */ + void finalize() const; + + ///@} + /** @name Queries */ + ///@{ + + /** @brief Get the executable name. + * + * This is only meaningful after calling either parse() or + * parse_no_finalize(). + * + * @return The name of the executable. + */ + std::string const& get_exe_name() const noexcept; + + /** @brief Test if an option exists in the parser. + * + * This only tests whether the argument or option is known to the + * parser, not whether it has been set or modified by the parser. + * + * @param[in] option_name The name of the option/argument. + */ + bool option_is_defined(std::string const& option_name) const; + + /** @brief Test if help has been requested. */ + bool help_requested() const; + + /** @brief Get the requested value from the argument list. + * @tparam T The type of the requested parameter. + * @param option_name The name given to the option or argument. + * @return A const-reference to the held value. + */ + template + T const& get(std::string const& option_name) const; + + ///@} + /** @name Output */ + ///@{ + + /** @brief Print a help string to a stream. + * @param[in] stream The ostream to print the help message to. + */ + void print_help(std::ostream& stream) const; + + ///@} + +private: + + /** @brief Implementation of add_flag */ + readonly_reference + add_flag_impl_(std::string const& name, + std::initializer_list cli_flags, + std::string const& description, + bool default_value); + +private: + /** @brief Dictionary of arguments to their values */ + std::unordered_map params_; + /** @brief Patch around in-progress clara limitation */ + std::unordered_set required_; + /** @brief The underlying clara object */ + clara::Parser parser_; + /** @brief The name of the executable. 
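Because finalize() only checks for unseen required arguments, and that check is skipped when help has been requested, a two-step parse lets an application print the help message even when a required argument is missing. A minimal fragment (inside main(), reusing the hypothetical parser from the earlier sketch):

    parser.parse_no_finalize(argc, argv);
    if (parser.help_requested()) {
      parser.print_help(std::cout);   // needs <iostream>
      return 0;
    }
    parser.finalize();  // throws missing_required_arguments if "model" was unseen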
*/ + std::string exe_name_ = ""; + +}; + +template +inline bool +argument_parser::option_is_defined(std::string const& option_name) const +{ + return params_.count(option_name); +} + +template +template +inline T const& argument_parser::get(std::string const& option_name) const +{ + return utils::any_cast(params_.at(option_name)); +} + +template +template +inline auto argument_parser::add_option( + std::string const& name, + std::initializer_list cli_flags, + std::string const& description, + T default_value) + -> readonly_reference +{ + params_[name] = std::move(default_value); + auto& param_ref = any_cast(params_[name]); + clara::Opt option(param_ref, name); + for (auto const& f : cli_flags) + option[f]; + parser_ |= option(description).optional(); + return param_ref; +} + +template +template +inline auto argument_parser::add_argument( + std::string const& name, + std::string const& description, + T default_value) + -> readonly_reference +{ + params_[name] = std::move(default_value); + auto& param_ref = utils::any_cast(params_[name]); + parser_ |= clara::Arg + (param_ref, name) + (description).optional(); + return param_ref; +} + +template +template +inline auto argument_parser::add_required_argument( + std::string const& name, + std::string const& description) + -> readonly_reference +{ + // Add the reference to bind to + params_[name] = T{}; + auto& param_any = params_[name]; + auto& param_ref = any_cast(param_any); + + required_.insert(name); + + // Make sure the required arguments are all grouped together. + auto iter = parser_.m_args.cbegin(), invalid = parser_.m_args.cend(); + while (iter != invalid && !iter->isOptional()) + ++iter; + + // Create the argument + auto ret = parser_.m_args.emplace( + iter, + [name,¶m_ref,this](std::string const& value) + { + auto result = clara::detail::convertInto(value, param_ref); + if (result) + required_.erase(name); + return result; + }, + name); + ret->operator() (description).required(); + return param_ref; +} + +template +argument_parser::argument_parser() +{ + params_["print help"] = false; + parser_ |= clara::ExeName(exe_name_); + parser_ |= clara::Help(utils::any_cast(params_["print help"])); + + // Work around a bug in Clara logic + parser_.m_exeName.set(exe_name_); +} + +template +void argument_parser::parse(int argc, char const* const argv[]) +{ + parse_no_finalize(argc, argv); + finalize(); +} + +template +void argument_parser::parse_no_finalize(int argc, char const* const argv[]) +{ + std::vector newargv(argv, argv+argc); + auto parse_result = + parser_.parse(clara::Args(newargv.size(), newargv.data())); + + if (!parse_result) + this->handle_error(parse_result, parser_, newargv); +} + +template +void argument_parser::finalize() const +{ + if (!help_requested() && required_.size()) + throw missing_required_arguments(required_); +} + +template +auto argument_parser::add_flag( + std::string const& name, + std::initializer_list cli_flags, + std::string const& description) + -> readonly_reference +{ + return add_flag_impl_(name, std::move(cli_flags), description, false); +} + +template +std::string const& argument_parser::get_exe_name() const noexcept +{ + return exe_name_; +} + +template +bool argument_parser::help_requested() const +{ + return utils::any_cast(params_.at("print help")); +} + +template +void argument_parser::print_help(std::ostream& out) const +{ + out << parser_ << std::endl; +} + +template +auto argument_parser::add_flag_impl_( + std::string const& name, + std::initializer_list cli_flags, + std::string const& 
description, + bool default_value) + -> readonly_reference +{ + params_[name] = default_value; + auto& param_ref = any_cast(params_[name]); + clara::Opt option(param_ref); + for (auto const& f : cli_flags) + option[f]; + parser_ |= option(description).optional(); + return param_ref; +} + +}// namespace utils + +using default_arg_parser_type = + utils::argument_parser; + +default_arg_parser_type& global_argument_parser(); + +}// namespace lbann + +/** @brief Write the parser's help string to the given @c ostream */ +template +std::ostream& operator<<( + std::ostream& os, + lbann::utils::argument_parser const& parser) +{ + parser.print_help(os); + return os; +} + +#endif /* LBANN_UTILS_ARGUMENT_PARSER_HPP_INCLUDED */ diff --git a/include/lbann/utils/beta.hpp b/include/lbann/utils/beta.hpp new file mode 100644 index 00000000000..eef834e2466 --- /dev/null +++ b/include/lbann/utils/beta.hpp @@ -0,0 +1,233 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_UTILS_BETA_HPP +#define LBANN_UTILS_BETA_HPP + +#include +#include +#include +#include + +#include "lbann/utils/random.hpp" +#include "lbann/utils/exception.hpp" + +namespace lbann { + +/** + * Produces random floating point values drawn from a Beta distribution with + * parameters a > 0 and b > 0. + * + * See: + * + * https://en.wikipedia.org/wiki/Beta_distribution + * + * for more details. 
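A short usage sketch for the distribution defined below; the engine, seed, and parameter values are illustrative only.

    #include <random>
    #include "lbann/utils/beta.hpp"

    float sample_beta()
    {
      // Any standard uniform random bit generator should work here.
      static thread_local std::mt19937 gen(13);
      lbann::beta_distribution<float> beta(0.5f, 2.0f);
      return beta(gen);  // in [0, 1]; takes the Gamma-based path since b > 1
    }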
+ */ +template +class beta_distribution { +public: + using result_type = RealType; + + class param_type { + public: + using distribution_type = beta_distribution; + + explicit param_type(RealType param_a, RealType param_b) : + m_a(param_a), m_b(param_b) { + if (param_a <= RealType(0) || param_b <= RealType(0)) { + LBANN_ERROR("Beta distribution parameters must be positive"); + } + } + + constexpr RealType a() const { return m_a; } + constexpr RealType b() const { return m_b; } + + bool operator==(const param_type& other) const { + return m_a == other.m_a && m_b == other.m_b; + } + bool operator!=(const param_type& other) const { + return m_a != other.m_a || m_b != other.m_b; + } + private: + RealType m_a, m_b; + }; + + explicit beta_distribution(RealType a, RealType b) : + m_params(a, b), m_gamma_a(a), m_gamma_b(b) {} + explicit beta_distribution(const param_type& p) : + m_params(p), m_gamma_a(p.a()), m_gamma_b(p.b()) {} + + result_type a() const { return m_params.a(); } + result_type b() const { return m_params.b(); } + + void reset() {} + + param_type param() const { return m_params; } + void param(const param_type& p) { + m_params = p; + m_gamma_a = gamma_dist(p.a()); + m_gamma_b = gamma_dist(p.b()); + } + + template + result_type operator()(Generator& g) { + return generate(g); + } + template + result_type operator()(Generator& g, const param_type& p) { + return generate(g, p); + } + + result_type min() const { return result_type(0); } + result_type max() const { return result_type(1); } + + bool operator==(const beta_distribution& other) const { + return param() == other.param(); + } + bool operator!=(const beta_distribution& other) const { + return param() != other.param(); + } + +private: + param_type m_params; + + using gamma_dist = std::gamma_distribution; + gamma_dist m_gamma_a, m_gamma_b; + + // Generator for when we use the distribution's parameters. + template + result_type generate(Generator& g) { + if (a() <= result_type(1) && b() <= result_type(1)) { + return generate_johnk(g, m_params.a(), m_params.b()); + } else { + return generate_gamma(g, m_gamma_a, m_gamma_b); + } + } + // Generator for when we use specified parameters. + template + result_type generate(Generator& g, const param_type& p) { + if (p.a() <= result_type(1) && p.b() <= result_type(1)) { + return generate_johnk(g, p.a(), p.b()); + } else { + gamma_dist gamma_a(p.a()), gamma_b(p.b()); + return generate_gamma(g, gamma_a, gamma_b); + } + } + + /** + * Generate Beta-distributed values using Johnk's algorithm. + * This is a rejection-sampling algorithm that only needs a few + * uniformly random values. + * + * See: + * + * Johnk, H. D. "Erzeugung von betaverteilten und gammaverteilten + * Zufallszahlen." Metrika 8, no. 1 (1964). + * + * For an English-language presentation, see: + * + * Atkinson, A. C. and M. C. Pearce. "The computer generation of beta, + * gamma and normal random variables." Journal of the Royal Statistical + * Society: Series A (General) 139, no. 4 (1976). + * + * This includes fixes for numerical stability when the parameters are small, + * see: + * + * https://github.com/numpy/numpy/issues/5851 + * + * for discussion there; and a catch for the (extremely rare) case of the RNG + * giving us U and V both exactly 0. + * + * Note: There should be an umlaut on the "o" in "Johnk", but blame poor + * unicode support. 
+ */ + template + result_type generate_johnk(Generator& g, result_type a, result_type b) { + while (true) { + const result_type U = random_uniform(g); + const result_type V = random_uniform(g); + const result_type X = std::pow(U, result_type(1) / a); + const result_type Y = std::pow(V, result_type(1) / b); + const result_type XplusY = X + Y; + if (XplusY <= result_type(1.0)) { + if (XplusY > result_type(0)) { + return X / XplusY; + } else if (U != result_type(0) && V != result_type(0)) { + // Work with logs instead if a/b is too small. + result_type logX = std::log(U) / a; + result_type logY = std::log(V) / b; + const result_type log_max = std::max(logX, logY); + logX -= log_max; + logY -= log_max; + return std::exp(logX - std::log(std::exp(logX) + std::exp(logY))); + } + } + } + } + + /** + * Generate Beta-distributed values based on Gamma distributions. + * See: + * https://en.wikipedia.org/wiki/Beta_distribution#Generating_beta-distributed_random_variates + * for details. + */ + template + result_type generate_gamma(Generator& g, gamma_dist& gamma_a, + gamma_dist& gamma_b) { + const result_type Ga = gamma_a(g); + const result_type Gb = gamma_b(g); + return Ga / (Ga + Gb); + } +}; + +template +std::basic_ostream& operator<<(std::basic_ostream& os, + const beta_distribution& d) { + os << "~Beta(" << d.a() << "," << d.b() << ")"; + return os; +} + +template +std::basic_istream& operator>>(std::basic_istream& is, + beta_distribution& d) { + std::string s; + RealType a, b; + if (std::getline(is, s, '(') && s == "~Beta" + && is >> a + && is.get() == ',' + && is >> b + && is.get() == ')') { + d = beta_distribution(a, b); + } else { + is.setstate(std::ios::failbit); + } + return is; +} + +} // namespace lbann + +#endif // LBANN_UTILS_BETA_HPP diff --git a/include/lbann/utils/cloneable.hpp b/include/lbann/utils/cloneable.hpp new file mode 100644 index 00000000000..7e5c825f2f3 --- /dev/null +++ b/include/lbann/utils/cloneable.hpp @@ -0,0 +1,234 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// +#ifndef LBANN_UTILS_CLONEABLE_HPP_INCLUDED +#define LBANN_UTILS_CLONEABLE_HPP_INCLUDED + +#include +#include + +/** @file + * + * This file implements covariant returns via smart pointers for a + * polymorphic @c clone function. The implementation largely follows + * the solution presented by + * the FluentC++ blog. 
Some class/tag names have been updated to + * be clearer, in my opinion. Additionally, a semi-useful predicate + * has been added to aid metaprogramming down the line. + */ + +namespace lbann { + +/** @brief Declare @c Base to be a virtual base. + * + * This metafunction adds @c Base as a virtual base + * class. Constructors of @c Base are added to this class. + * + * @tparam Base The class to be declared as a virtual base. + */ +template +struct AsVirtualBase : virtual Base +{ + using Base::Base; +}; + +/** @brief Declare that @c T has unimplemented virtual functions. + * + * Due to metaprogramming restrictions on CRTP interfaces, we rely on + * the user of these mechanisms to declare when a class has + * unimplemented virtual functions (or "is abstract"). + * + * @tparam T The type that has at least one unimplemented virtual + * function. + */ +template +struct HasAbstractFunction {}; + +/** @brief Alias for HasAbstractFunction. + * + * Good OO practice suggests that non-leaf classes should be abstract + * -- that is, have at least one unimplemented virtual + * function. LBANN fits this paradigm, so this alias is appropriate. +*/ +template +using NonLeafClass = HasAbstractFunction; + +/** @brief Inject polymorphic clone functions into hierarchies. + * + * This class uses CRTP to inject the derived class's clone() + * function directly into the class and uses + * the + * Template Method to virtualize it. + * + * @tparam T The concrete class to be cloned. + * @tparam Base The base class of T. + */ +template +class Cloneable + : public Base... +{ +public: + /** @brief Return an exception-safe, memory-safe copy of this object. */ + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +private: + /** @brief Implement the covariant raw-pointer-based clone operation. */ + virtual Cloneable* do_clone_() const override { + return new T(static_cast(*this)); + } +};// class Cloneable + +template +class Cloneable + : public Base +{ +public: + /** @brief Return an exception-safe, memory-safe copy of this object. */ + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +protected: + using Base::Base; +private: + /** @brief Implement the covariant raw-pointer-based clone operation. */ + virtual Cloneable* do_clone_() const override { + return new T(static_cast(*this)); + } +};// class Cloneable + +/** @brief Specialization of Cloneable to handle stand-alone classes. */ +template +class Cloneable +{ +public: + virtual ~Cloneable() = default; + + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +private: + Cloneable* do_clone_() const { + return new T(static_cast(*this)); + } +};// class Cloneable + +/** @brief Specialization of Cloneable for intermediate classes. + * + * Classes that are neither the top of the hierarchy nor a leaf of + * the class tree should be virtual. An unfortunate consequence of + * the CRTP method is that the target of the CRTP, @c T in this case, + * is not a complete class when this class is instantiated, so + * metaprogramming based on @c T is very restricted. Thus, users must + * tag the target class with HasAbstractFunction. Doing so will + * ensure that the @c do_clone_() function is declared pure virtual. + */ +template +class Cloneable, Base...> + : public Base... 
+{ +public: + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +private: + virtual Cloneable* do_clone_() const = 0; +}; + +template +class Cloneable, Base> + : public Base +{ +public: + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +protected: + using Base::Base; +private: + virtual Cloneable* do_clone_() const = 0; +}; + +/** @brief Specialization of Cloneable to handle the top of hierarchies. */ +template +class Cloneable> +{ +public: + virtual ~Cloneable() = default; + + std::unique_ptr clone() const { + return std::unique_ptr{static_cast(this->do_clone_())}; + } +private: + virtual Cloneable* do_clone_() const = 0; +};// class Cloneable + +/** @brief Predicate testing for Cloneable interface. + * + * This predicate determines whether a class supports the Cloneable + * interface. If true, this class will support a smart-pointer-to-T + * return from a @c clone() method. + * + * This predicate type suffers a deficiency that it can be fooled + * rather easily. It is generally not possible to determine from the + * specific Cloneable instantiation used for a given type. Thus, + * alternative strategies must be used. As it stands, any class that + * provides a @c clone() method that returns a @c std::unique_ptr + * will satisfy this predicate. + * + * @tparam T The type being tested. + */ +template +struct IsCloneableT; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +// The obvious case; I'd be concerned if this were ever called. +template +struct IsCloneableT> : std::true_type {}; + +namespace details { + +struct definitely_not_a_unique_ptr; + +template +auto has_right_clone(T const& x) -> decltype(x.clone()); + +definitely_not_a_unique_ptr has_right_clone(...); + +}// namespace details + +template +struct IsCloneableT + : std::is_same())), + std::unique_ptr> +{}; +#endif // DOXYGEN_SHOULD_SKIP_THIS + +template +constexpr bool IsCloneable_v() { return IsCloneableT::value; }; + +}// namespace lbann +#endif // LBANN_UTILS_CLONEABLE_HPP_INCLUDED diff --git a/include/lbann/utils/commify.hpp b/include/lbann/utils/commify.hpp new file mode 100644 index 00000000000..d5c43ab1956 --- /dev/null +++ b/include/lbann/utils/commify.hpp @@ -0,0 +1,16 @@ +#ifndef LBANN_UTILS_COMMIFY_INCLUDED +#define LBANN_UTILS_COMMIFY_INCLUDED + +#include + +namespace lbann +{ +namespace utils +{ + +/** @brief Inserts commas large integers for pretty-printing */ +std::string commify(size_t n); + +}// namespace utils +}// namespace lbann +#endif // LBANN_UTILS_ANY_HPP_INCLUDED diff --git a/include/lbann/utils/cublas.hpp b/include/lbann/utils/cublas.hpp index 49225ff2336..e206b0e3813 100644 --- a/include/lbann/utils/cublas.hpp +++ b/include/lbann/utils/cublas.hpp @@ -29,6 +29,7 @@ #include "lbann/base.hpp" #include "lbann/utils/cuda.hpp" +#include "lbann/utils/exception.hpp" #ifdef LBANN_HAS_CUDA #include @@ -44,8 +45,9 @@ const cublasStatus_t status_FORCE_CHECK_CUBLAS = (cublas_call); \ if (status_FORCE_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) { \ cudaDeviceReset(); \ - LBANN_ERROR(std::string("cuBLAS error: ") \ - + lbann::cublas::get_error_string(status_FORCE_CHECK_CUBLAS)); \ + LBANN_ERROR("cuBLAS error: ", \ + lbann::cublas::get_error_string( \ + status_FORCE_CHECK_CUBLAS)); \ } \ } \ { \ @@ -55,8 +57,8 @@ status_FORCE_CHECK_CUBLAS = cudaGetLastError(); \ if (status_FORCE_CHECK_CUBLAS != cudaSuccess) { \ cudaDeviceReset(); \ - LBANN_ERROR(std::string("CUDA error: ") \ - + cudaGetErrorString(status_FORCE_CHECK_CUBLAS)); \ + 
LBANN_ERROR("CUDA error: ", \ + cudaGetErrorString(status_FORCE_CHECK_CUBLAS)); \ } \ } \ } while (0) @@ -67,20 +69,19 @@ const cublasStatus_t status_FORCE_CHECK_CUBLAS = (cublas_call); \ if (status_FORCE_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) { \ cudaDeviceReset(); \ - LBANN_ERROR(std::string("cuBLAS error: ") \ - + lbann::cublas::get_error_string(status_FORCE_CHECK_CUBLAS)); \ + LBANN_ERROR("cuBLAS error: ", \ + lbann::cublas::get_error_string( \ + status_FORCE_CHECK_CUBLAS)); \ } \ } \ } while (0) -#define FORCE_CHECK_CUBLAS_SYNC(cuda_call) \ - do { \ - const cudaError_t cuda_status = cuda_call; \ - if (cuda_status != cudaSuccess) { \ - std::cerr << "CUDA error: " << cudaGetErrorString(cuda_status) << "\n"; \ - std::cerr << "Error at " << __FILE__ << ":" << __LINE__ << "\n"; \ - cudaDeviceReset(); \ - throw lbann::lbann_exception("CUDA error"); \ - } \ +#define FORCE_CHECK_CUBLAS_SYNC(cuda_call) \ + do { \ + const cudaError_t cuda_status = cuda_call; \ + if (cuda_status != cudaSuccess) { \ + cudaDeviceReset(); \ + LBANN_ERROR("CUDA error: ", cudaGetErrorString(cuda_status)); \ + } \ } while (0) #ifdef LBANN_DEBUG #define CHECK_CUBLAS(cublas_call) \ @@ -99,61 +100,88 @@ namespace cublas { const std::string get_error_string(cublasStatus_t status); // BLAS Level-1 functions +template void axpy(cublasHandle_t const& handle, int n, - DataType alpha, - DataType const* x, int incx, - DataType * y, int incy); + TensorDataType alpha, + TensorDataType const* x, int incx, + TensorDataType * y, int incy); +template void dot(cublasHandle_t const& handle, int n, - DataType const* x, int incx, - DataType const* y, int incy, - DataType * result); -DataType dot(cublasHandle_t const& handle, + TensorDataType const* x, int incx, + TensorDataType const* y, int incy, + TensorDataType * result); +template +TensorDataType dot(cublasHandle_t const& handle, int n, - DataType const* x, int incx, - DataType const* y, int incy); + TensorDataType const* x, int incx, + TensorDataType const* y, int incy); +template void nrm2(cublasHandle_t const& handle, int n, - DataType const* x, int incx, - DataType * result); -DataType nrm2(cublasHandle_t const& handle, + TensorDataType const* x, int incx, + TensorDataType * result); +template +TensorDataType nrm2(cublasHandle_t const& handle, int n, - DataType const* x, int incx); + TensorDataType const* x, int incx); +template void scal(cublasHandle_t const& handle, int n, - DataType alpha, - DataType * x, int incx); + TensorDataType alpha, + TensorDataType * x, int incx); // BLAS Level-2 functions +template void gemv(cublasHandle_t const& handle, cublasOperation_t trans, int m, int n, - DataType alpha, - DataType const * A, int lda, - DataType const * x, int incx, - DataType beta, - DataType * y, int iny); + TensorDataType alpha, + TensorDataType const * A, int lda, + TensorDataType const * x, int incx, + TensorDataType beta, + TensorDataType * y, int iny); // BLAS Level-3 functions +template void gemm(cublasHandle_t const& handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - DataType alpha, - DataType const * A, int lda, - DataType const * B, int ldb, - DataType beta, - DataType * C, int ldc); + TensorDataType alpha, + TensorDataType const * A, int lda, + TensorDataType const * B, int ldb, + TensorDataType beta, + TensorDataType * C, int ldc); // BLAS-like extension +template void geam(cublasHandle_t const& handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, - DataType alpha, - DataType const * A, int lda, - DataType beta, - 
DataType const * B, int ldb, - DataType * C, int ldc); + TensorDataType alpha, + TensorDataType const * A, int lda, + TensorDataType beta, + TensorDataType const * B, int ldb, + TensorDataType * C, int ldc); +template +void gemm_strided_batched(cublasHandle_t const& handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + TensorDataType alpha, + TensorDataType const * A, int lda, + long long int strideA, + TensorDataType const * B, int ldb, + long long int strideB, + TensorDataType beta, + TensorDataType * C, int ldc, + long long int strideC, + int batchCount); + +/** @brief Set the default to use tensor core operations, allowing + * FP32->FP16 conversions. + */ +void default_to_tensor_ops(); } // namespace cublas } // namespace lbann diff --git a/include/lbann/utils/cuda.hpp b/include/lbann/utils/cuda.hpp index 87201c0fe8d..d124487df3e 100644 --- a/include/lbann/utils/cuda.hpp +++ b/include/lbann/utils/cuda.hpp @@ -110,6 +110,40 @@ namespace cuda { template __device__ __forceinline__ T atomic_add(T* address, T val); +/** @brief Sum over threads in CUDA block + * + * Every thread in a CUDA block must enter this function. The sum is + * returned on thread 0. + * + * @tparam bdimx x-dimension of CUDA block + * @tparam bdimy y-dimension of CUDA block + * @tparam bdimz z-dimension of CUDA block + * @tparam T Data type + * @param val Contribution from thread + * @returns On thread 0, the sum. Not meaningful on other threads. + */ +template +__device__ __forceinline__ +T block_reduce(T val); + +/** @brief Reduction over threads in CUDA block + * + * Every thread in a CUDA block must enter this function. The reduced + * value is returned on thread 0. + * + * @tparam bdimx x-dimension of CUDA block + * @tparam bdimy y-dimension of CUDA block + * @tparam bdimz z-dimension of CUDA block + * @tparam T Data type + * @tparam Op Functor for reduction operation + * @param val Contribution from each thread + * @returns On thread 0, the reduced value. Not meaningful on other + * threads. + */ +template +__device__ __forceinline__ +T block_reduce(T val); + // Unary math functions template __device__ __forceinline__ T abs(const T& x); template __device__ __forceinline__ T round(const T& x); @@ -146,6 +180,15 @@ template constexpr __device__ __forceinline__ T max(); template constexpr __device__ __forceinline__ T epsilon(); template __device__ __forceinline__ T infinity(); +/** @brief Array with fixed type and size. */ +template +struct array { + T vals[N]; + __host__ __device__ __forceinline__ size_t size() const; + __host__ __device__ __forceinline__ T& operator[](size_t i); + __host__ __device__ __forceinline__ const T& operator[](size_t i) const; +}; + #endif // __CUDACC__ // ------------------------------------------------------------- @@ -187,36 +230,40 @@ class event_wrapper { * The input and output data must be on GPU and must have the same * dimensions. */ -template -void apply_entrywise_unary_operator(const AbsMat& input, - AbsMat& output); +template
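Stepping back to the cloneable.hpp interface introduced above: the intended pattern is that the root of a hierarchy inherits from Cloneable<HasAbstractFunction<Root>> while each concrete class inherits from Cloneable<Derived, Base>, giving every level a covariant, smart-pointer-returning clone(). The hierarchy below is an illustrative sketch, not code from this patch.

    #include "lbann/utils/cloneable.hpp"
    #include <memory>

    // Hypothetical classes; only Cloneable and HasAbstractFunction come from
    // the header added above.
    class shape : public lbann::Cloneable<lbann::HasAbstractFunction<shape>>
    {
    public:
      virtual double area() const = 0;
    };

    class circle final : public lbann::Cloneable<circle, shape>
    {
    public:
      explicit circle(double r) : m_radius{r} {}
      double area() const override { return 3.141592653589793 * m_radius * m_radius; }
    private:
      double m_radius;
    };

    // clone() through the base yields std::unique_ptr<shape>; calling it on a
    // circle directly yields std::unique_ptr<circle>. No manual casts needed.
    std::unique_ptr<shape> copy_of(shape const& s) { return s.clone(); }

    static_assert(lbann::IsCloneableT<circle>::value,
                  "circle exposes a conforming clone()");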